You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
tdewebdev/klinkstatus/src/parser/htmlparser.cpp

456 lines
13 KiB

/***************************************************************************
* Copyright (C) 2004 by Paulo Moura Guedes *
* moura@tdewebdev.org *
* *
* This program is free software; you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation; either version 2 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program; if not, write to the *
* Free Software Foundation, Inc., *
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
***************************************************************************/
#include "htmlparser.h"
#include <kapplication.h>
#include <kdebug.h>
/**
 * Builds a parser over the given HTML text and immediately runs every
 * extraction pass on it, in the order written below.
 *
 * The document is first cleaned (scripts, then comments), after which each
 * parseNodesOfTypeXXX() pass collects the elements of one tag type.
 *
 * @param documento the HTML source to parse; asserted to be non-empty.
 */
HtmlParser::HtmlParser(TQString const& documento)
: is_content_type_set_(false), document_(documento)
{
Q_ASSERT(!documento.isEmpty());
stripScriptContent();
stripComments(); // after removing the scripts, because comments inside scripts have a different syntax
nodes_.reserve(estimativaLinks(documento.length() * 2)); // a rough estimate, taken on trust ;)
parseNodesOfTypeA();
parseNodesOfTypeAREA();
parseNodesOfTypeLINK();
parseNodesOfTypeMETA();
parseNodesOfTypeIMG();
parseNodesOfTypeFRAME();
parseNodesOfTypeIFRAME();
parseNodesOfTypeBASE();
parseNodesOfTypeTITLE();
}
/**
 * Tells whether a usable BASE element was recorded for this document.
 *
 * @return true when node_BASE_ reports Node::BASE as its element and its
 *         url() is not empty; false otherwise.
 */
bool HtmlParser::hasBaseUrl() const
{
    if(node_BASE_.element() != Node::BASE)
        return false;

    return !node_BASE_.url().isEmpty();
}
/**
 * Gives read-only access to the stored BASE node.
 *
 * Asserts hasBaseUrl(), so callers are expected to check that first.
 *
 * @return a reference to node_BASE_.
 */
NodeBASE const& HtmlParser::baseUrl() const
{
Q_ASSERT(hasBaseUrl());
return node_BASE_;
}
/**
 * Gives read-only access to the META node stored as the document's
 * Content-Type declaration (filled in by parseNodesOfTypeMETA()).
 *
 * Asserts hasContentType(), so callers are expected to check that first.
 *
 * @return a reference to node_META_content_type_.
 */
NodeMETA const& HtmlParser::contentTypeMetaNode() const
{
Q_ASSERT(hasContentType());
return node_META_content_type_;
}
/**
 * Tells whether a non-empty TITLE element was recorded for this document.
 *
 * @return true when node_TITLE_ reports Node::TITLE as its element and its
 *         attributeTITLE() is not empty; false otherwise.
 */
bool HtmlParser::hasTitle() const
{
    if(node_TITLE_.element() != Node::TITLE)
        return false;

    return !node_TITLE_.attributeTITLE().isEmpty();
}
/**
 * Gives read-only access to the stored TITLE node.
 *
 * Asserts hasTitle(), so callers are expected to check that first.
 *
 * @return a reference to node_TITLE_.
 */
NodeTITLE const& HtmlParser::title() const
{
Q_ASSERT(hasTitle());
return node_TITLE_;
}
/**
 * Collects the raw text of every tag of the given type found in document_.
 *
 * The result lives in the member buffer aux_, which is cleared and refilled
 * on every call, so the returned reference is only meaningful until the
 * next call.
 *
 * @param element tag name to look for (for example "A" or "META").
 * @return reference to aux_, holding one string per tag found.
 */
vector<TQString> const& HtmlParser::parseNodesOfType(TQString const& element)
{
    vector<TQString>& found = aux_;
    parseNodesOfType(element, document_, found);
    return found;
}
/**
 * Extracts the raw text of every tag of type @p tipo from @p document.
 *
 * Works on a private copy of the document that is consumed from the front
 * as tags are found. A candidate only counts when the character right after
 * "<tipo" is whitespace. For "A" the node runs up to the end of the next
 * "</A>"; for any other type it runs up to the end of the tag as reported
 * by endOfTag().
 *
 * @param tipo     tag name to look for.
 * @param document HTML text to scan; not modified.
 * @param nodes    output; cleared first, then filled with one string per tag.
 */
void HtmlParser::parseNodesOfType(TQString const& tipo, TQString const& document, vector<TQString>& nodes)
{
    TQString remaining(document);
    nodes.clear();

    // Values that never change while scanning.
    TQString const opening_tag("<" + tipo);
    int const opening_tag_length = opening_tag.length();
    bool const is_anchor = (upperCase(tipo) == "A");

    if(is_anchor)
        nodes.reserve(estimativaLinks(remaining.length() * 2));

    for(;;)
    {
        // The helper's result is used as the index just past "<tipo";
        // that is why the opening tag length is subtracted further down.
        int const after_opening = findSeparableWord(remaining, opening_tag);
        if(after_opening == -1)
            return;

        // "<tipo" must be followed by whitespace, otherwise it is a longer
        // tag name; drop a little from the front and search again.
        if(!::isSpace(remaining[after_opening]))
        {
            remaining.remove(0, opening_tag.length());
            continue;
        }

        int const node_end = is_anchor
                             ? findWord(remaining, "</A>", after_opening)
                             : endOfTag(remaining, after_opening, '>');

        // No terminator: shave one character off the front and retry.
        if(node_end == -1)
        {
            remaining.remove(0, 1);
            continue;
        }

        nodes.push_back(remaining.mid(after_opening - opening_tag_length,
                                      node_end - after_opening + opening_tag_length));
        remaining.remove(0, node_end);
    }
}
/**
 * Finds where a tag ends, skipping over double-quoted sections so that a
 * closing character inside an attribute value is not mistaken for the end.
 *
 * @param s          text being scanned.
 * @param index      position to start scanning from.
 * @param end_of_tag character that closes the tag.
 * @return the index just past the closing character, or -1 when @p index is
 *         out of range, no closing character follows, or an opening quote
 *         sits too close to the end of @p s.
 */
int HtmlParser::endOfTag(TQString const& s, int index, TQChar end_of_tag)
{
    // Each pass either settles the answer or hops over one quoted section.
    for(;;)
    {
        if((uint)index >= s.length())
            return -1;

        int const closing = s.find(end_of_tag, index);
        if(closing == -1)
            return -1;

        int const quote_open = s.find('"', index);

        // No quote ahead, or the tag closes before the next quote opens.
        if(quote_open == -1 || closing < quote_open)
            return closing + 1;

        if(((uint)quote_open + 1) >= s.length() - 1)
            return -1;

        int const quote_close = s.find('"', quote_open + 1);
        if(quote_close == -1)
        {
            // Unbalanced quote: fall back to the first closing character seen.
            kdDebug(23100) << "Mismatched quotes (\"): " << s.mid(index, closing - index) << endl;
            return closing + 1;
        }

        index = quote_close + 1;
    }
}
/**
 * Gives read-only access to the nodes collected by the parseNodesOfTypeXXX()
 * passes.
 *
 * @return a reference to nodes_.
 */
vector<Node*> const& HtmlParser::nodes() const
{
return nodes_;
}
void HtmlParser::parseNodesOfTypeA()
{
vector<TQString> const& aux = parseNodesOfType("A");
for(vector<TQString>::size_type i = 0; i != aux.size(); ++i)
{
nodes_.push_back( new NodeA(aux[i]) );
}
}
void HtmlParser::parseNodesOfTypeAREA()
{
vector<TQString> const& aux = parseNodesOfType("AREA");
for(vector<TQString>::size_type i = 0; i != aux.size(); ++i)
{
nodes_.push_back( new NodeAREA(aux[i]) );
}
}
void HtmlParser::parseNodesOfTypeLINK()
{
vector<TQString> const& aux = parseNodesOfType("LINK");
for(vector<TQString>::size_type i = 0; i != aux.size(); ++i)
nodes_.push_back( new NodeLINK(aux[i]) );
}
void HtmlParser::parseNodesOfTypeMETA()
{
vector<TQString> const& aux = parseNodesOfType("META");
for(vector<TQString>::size_type i = 0; i != aux.size(); ++i)
{
NodeMETA* node = new NodeMETA(aux[i]);
nodes_.push_back(node);
if(!is_content_type_set_ && node->atributoHTTP_EQUIV().lower() == TQString("Content-Type").lower()) {
is_content_type_set_ = true;
node_META_content_type_.setNode(aux[i]);
}
}
}
/**
 * Looks for a charset declaration in the META tags of an HTML text, without
 * building a full parser.
 *
 * @param html HTML text to scan.
 * @return charset() of the first META tag whose http-equiv attribute equals
 *         "Content-Type" (compared in lower case), or a null TQString when
 *         there is no such tag.
 */
TQString HtmlParser::findCharsetInMetaElement(TQString const& html)
{
    vector<TQString> meta_tags;
    parseNodesOfType("META", html, meta_tags);

    vector<TQString>::const_iterator tag = meta_tags.begin();
    while(tag != meta_tags.end())
    {
        NodeMETA candidate(*tag);
        bool const is_content_type =
            (candidate.atributoHTTP_EQUIV().lower() == TQString("Content-Type").lower());
        if(is_content_type)
            return candidate.charset();
        ++tag;
    }

    return TQString();
}
void HtmlParser::parseNodesOfTypeIMG()
{
vector<TQString> const& aux = parseNodesOfType("IMG");
for(vector<TQString>::size_type i = 0; i != aux.size(); ++i)
nodes_.push_back( new NodeIMG(aux[i]) );
}
void HtmlParser::parseNodesOfTypeFRAME()
{
vector<TQString> const& aux = parseNodesOfType("FRAME");
for(vector<TQString>::size_type i = 0; i != aux.size(); ++i)
nodes_.push_back( new NodeFRAME(aux[i]) );
}
void HtmlParser::parseNodesOfTypeIFRAME()
{
vector<TQString> const& aux = parseNodesOfType("IFRAME");
for(vector<TQString>::size_type i = 0; i != aux.size(); ++i)
nodes_.push_back( new NodeFRAME(aux[i]) );
}
void HtmlParser::parseNodesOfTypeBASE()
{
TQString node;
TQString doc = document_;
int inicio = 0, fim = 0;
inicio = findSeparableWord(doc, "<BASE");
if(inicio == -1 || !doc[inicio].isSpace())
return;
fim = doc.find(">", inicio);
if(fim == -1)
return;
node = doc.mid(inicio, fim-inicio);
node_BASE_.setNode(node);
}
void HtmlParser::parseNodesOfTypeTITLE()
{
TQString node;
TQString doc = document_;
int inicio = 0, fim = 0;
inicio = findSeparableWord(doc, "<TITLE>");
if(inicio == -1)
return;
fim = findSeparableWord(doc, "</TITLE>", inicio);
if(fim == -1)
return;
node = doc.mid(inicio, fim-inicio);
node_TITLE_.setNode(node);
}
void HtmlParser::stripComments()
{
TQString begin_comment = "<!--";
TQString end_comment = "-->";
uint const begin_comment_length = begin_comment.length();
int inicio = -1;
do
{
inicio = findWord(document_, begin_comment);
if(inicio != -1)
{
int fim = findWord(document_, end_comment, inicio);
if(fim == -1)
{
kdDebug(23100) << "End of comment is missing!" << endl;
document_.remove(inicio - begin_comment_length, begin_comment_length);
}
else
{
comments_ += "\n" + document_.mid(inicio - begin_comment_length,
fim - inicio + begin_comment_length);
document_.remove(inicio - begin_comment_length, fim - inicio + begin_comment_length);
}
}
}
while(inicio != -1);
}
void HtmlParser::stripScriptContent()
{
int inicio = -1;
TQString const begin_script = "<script";
TQString const end_script = "</script>";
uint const begin_script_length = begin_script.length();
do
{
inicio = findWord(document_, begin_script);
if(inicio != -1)
{
int fim = findWord(document_, end_script, inicio);
if(fim == -1)
{
kdDebug(23100) << "Malformed script tag!" << endl;
document_.remove(inicio - begin_script_length, begin_script_length);
}
else
{
script_ += "\n" + document_.mid(inicio - begin_script_length,
fim - inicio + begin_script_length);
document_.remove(inicio - begin_script_length,
fim - inicio + begin_script_length);
}
}
}
while(inicio != -1);
}
#include <iostream>
void HtmlParser::mostra() const
{
kdDebug(23100) << "\nA:\n\n";
for(unsigned int i = 0; i != nodes_.size(); ++i)
{
if(nodes_[i]->element() == Node::A)
kdDebug(23100) << nodes_[i]->url() << "\t" << nodes_[i]->linkLabel() << endl;
}
kdDebug(23100) << "____________________________________________________________________" << endl;
kdDebug(23100) << "\nLINK:\n\n";
for(unsigned int i = 0; i != nodes_.size(); ++i)
{
if(nodes_[i]->element() == Node::LINK)
kdDebug(23100) << nodes_[i]->url() << "\t" << nodes_[i]->linkLabel() << endl;
}
kdDebug(23100) << "____________________________________________________________________" << endl;
kdDebug(23100) << "\nMETA:\n";
for(unsigned int i = 0; i != nodes_.size(); ++i)
{
if(nodes_[i]->element() == Node::META)
{
#if defined TQ_WS_WIN
NodeMETA* nm = (NodeMETA*)nodes_[i];
#else
NodeMETA* nm = dynamic_cast<NodeMETA*>(nodes_[i]);
#endif
kdDebug(23100) << nm->url() << endl
<< nm->atributoHTTP_EQUIV() << endl
<< nm->atributoNAME() << endl
<< nm->atributoCONTENT() << endl;
}
}
kdDebug(23100) << "____________________________________________________________________" << endl;
kdDebug(23100) << "\nIMG:\n\n";
for(unsigned int i = 0; i != nodes_.size(); ++i)
{
if(nodes_[i]->element() == Node::IMG)
kdDebug(23100) << nodes_[i]->url() << "\t"
<< nodes_[i]->linkLabel() << endl;
}
kdDebug(23100) << "____________________________________________________________________" << endl;
kdDebug(23100) << "\nFRAME:\n\n";
for(unsigned int i = 0; i != nodes_.size(); ++i)
{
if(nodes_[i]->element() == Node::FRAME)
kdDebug(23100) << nodes_[i]->url() << endl;
}
kdDebug(23100) << "____________________________________________________________________" << endl;
kdDebug(23100) << "\nBASE:\n\n";
kdDebug(23100) << node_BASE_.url() << endl;
kdDebug(23100) << "____________________________________________________________________" << endl;
}
#ifdef HTMLPARSER
#include <fstream>
int main()
{
//ifstream stream("aterraprometida.html");
//ifstream stream("/var/www/html/STL/standard_library.html");
//ifstream stream("/var/www/html/qt-doc/functions.html");
ifstream stream("/var/www/html/index.html");
TQString content;
while(stream)
{
char c;
stream.get(c);
content += c;
}
// kdDebug(23100) << content << endl;
kdDebug(23100) << "__________________________________________________________" << endl;
HtmlParser parser(content);
parser.mostra();
kdDebug(23100) << "__________________________________________________________\n\n\n" << endl;
vector<Node*> nods = parser.nodes();
for(int i = 0; i != nods.size(); ++i)
{
if(nods[i]->element() == Node::META)
{
NodeMETA* nod_meta = (NodeMETA*)(nods[i]);
//Node* nod_meta = nods[i];
kdDebug(23100) << nod_meta->atributoCONTENT() << endl;
}
}
}
#endif