tdewebdev/klinkstatus/src/parser/htmlparser.h

	/***************************************************************************
 *   Copyright (C) 2004 by Paulo Moura Guedes                              *
 *   moura@kdewebdev.org                                                        *
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU General Public License as published by  *
 *   the Free Software Foundation; either version 2 of the License, or     *
 *   (at your option) any later version.                                   *
 *                                                                         *
 *   This program is distributed in the hope that it will be useful,       *
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
 *   GNU General Public License for more details.                          *
 *                                                                         *
 *   You should have received a copy of the GNU General Public License     *
 *   along with this program; if not, write to the                         *
 *   Free Software Foundation, Inc.,                                       *
 *   51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.             *
 ***************************************************************************/

#ifndef HTML_PARSER_H
#define HTML_PARSER_H

#include <tqstring.h>

#include <vector>


#include "mstring.h"
#include "node.h"

#include <iostream>

// NOTE(review): a using-directive at header scope is visible in every file
// that includes this header. The declarations below depend on it (they use
// the unqualified name `vector`), so it cannot be removed without qualifying
// those names first.
using namespace std;

// Shorthand for unsigned int, used by HtmlParser::estimativaLinks().
typedef unsigned int uint;


/**
 * Parser for an HTML document held in a TQString.
 *
 * The public interface exposes a vector of Node pointers plus dedicated
 * accessors for the BASE, TITLE and content-type META nodes. Most member
 * functions are defined outside this header; notes below about behaviour
 * that is not visible here are marked as assumptions to be confirmed.
 */
class HtmlParser
{
public:

  HtmlParser();
  // NOTE(review): single-argument and not explicit, so a TQString converts
  // implicitly to an HtmlParser. `documento` is presumably the HTML text to
  // be parsed -- confirm against the implementation.
  HtmlParser(TQString const& documento);
  ~HtmlParser();

  // Read-only view of a vector of Node pointers (presumably nodes_ -- confirm).
  vector<Node*> const& nodes() const;
  // Presence checks. NOTE(review): presumably baseUrl(), title() and
  // contentTypeMetaNode() are only meaningful when the matching check
  // returns true -- confirm against the implementation.
  bool hasBaseUrl() const;
  bool hasTitle() const;
  // Defined inline below this class; returns is_content_type_set_.
  bool hasContentType() const;
  NodeBASE const& baseUrl() const;
  NodeTITLE const& title() const;
  NodeMETA const& contentTypeMetaNode() const;

  // Estimate derived from the document size; defined inline below this
  // class as doc_size / 100. Judging by the name it estimates the number of
  // links -- confirm with the callers.
  static uint estimativaLinks(uint doc_size);
  /**
   * Convenience function, provided for performance: it only parses what is
   * needed in order to get the charset.
   */
  static TQString findCharsetInMetaElement(TQString const& html);

  // Test helper ("mostra" is Portuguese for "show"); presumably dumps the
  // parser state -- confirm against the implementation.
  void mostra() const;

private:

  // NOTE(review): returns a reference rather than a copy; presumably the
  // referenced vector is aux_ -- confirm against the implementation.
  vector<TQString> const& parseNodesOfType(TQString const& element);
  /**
   * The vector `nodes` is passed in by reference for performance.
   */
  static void parseNodesOfType(TQString const& element, TQString const& doc, vector<TQString>& nodes);

  // Presumably one parsing pass per HTML element type named in the suffix
  // (A, AREA, LINK, META, IMG, FRAME, IFRAME, BASE, TITLE) -- confirm.
  void parseNodesOfTypeA();
  void parseNodesOfTypeAREA();
  void parseNodesOfTypeLINK();
  void parseNodesOfTypeMETA();
  void parseNodesOfTypeIMG();
  void parseNodesOfTypeFRAME();
  void parseNodesOfTypeIFRAME();
  void parseNodesOfTypeBASE();
  void parseNodesOfTypeTITLE();

  // Presumably remove comments / script content from document_ and keep the
  // removed text in comments_ / script_ -- confirm against the implementation.
  void stripComments();
  void stripScriptContent();

  /**
     Return the index of the character that follows the end of the tag.
     As the example shows, an end-of-tag character inside a quoted
     attribute value does not count as the end of the tag.
     e.g.
     endOfTag("<img src=\"bad > luck\">") => 22 (not 15)
  */
  static int endOfTag(TQString const& s, int index = 0, TQChar end_of_tag = '>');

private:

  // Purpose undocumented: the original author's note here said they could no
  // longer remember what it is for. NOTE(review): presumably the storage
  // behind the reference returned by the non-static parseNodesOfType() --
  // confirm.
  vector<TQString> aux_;
  // NOTE(review): raw pointers; who owns the pointed-to nodes is not visible
  // in this header (the inline destructor below has an empty body).
  vector<Node*> nodes_;
  NodeBASE node_BASE_;
  NodeTITLE node_TITLE_;
  NodeMETA node_META_content_type_;
  bool is_content_type_set_; // value reported by hasContentType()

  TQString document_;
  TQString script_; // Script content is kept stored here (JavaScript, etc.)
  TQString comments_; // presumably the text removed by stripComments() -- confirm
};


// Destructor with an empty body (only a disabled debug trace used to live
// here): nothing happens beyond the members' own destruction, so the
// pointers stored in nodes_ are not deleted by this function.
inline HtmlParser::~HtmlParser() {}

// Returns an estimated value computed from the document size: doc_size
// divided by 100, using unsigned integer division (so sizes below 100
// give 0).
inline uint HtmlParser::estimativaLinks(uint doc_size)
{
  uint const size_per_estimated_unit = 100; // estimated ratio
  return doc_size / size_per_estimated_unit;
}

inline bool HtmlParser::hasContentType() const 
{
    return is_content_type_set_;
}

#endif
Copy the KDE 3.5 branch to branches/trinity for new KDE 3.5 features. BUG:215923 git-svn-id: svn://anonsvn.kde.org/home/kde/branches/trinity/kdewebdev@1054174 283d02a7-25f6-0310-bc7c-ecb5cbfe19da 15 years ago			`/***************************************************************************`
			`* Copyright (C) 2004 by Paulo Moura Guedes *`
Additional k => tde renaming and fixes 11 years ago			`* moura@kdewebdev.org *`
Copy the KDE 3.5 branch to branches/trinity for new KDE 3.5 features. BUG:215923 git-svn-id: svn://anonsvn.kde.org/home/kde/branches/trinity/kdewebdev@1054174 283d02a7-25f6-0310-bc7c-ecb5cbfe19da 15 years ago			`* *`
			`* This program is free software; you can redistribute it and/or modify *`
			`* it under the terms of the GNU General Public License as published by *`
			`* the Free Software Foundation; either version 2 of the License, or *`
			`* (at your option) any later version. *`
			`* *`
			`* This program is distributed in the hope that it will be useful, *`
			`* but WITHOUT ANY WARRANTY; without even the implied warranty of *`
			`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *`
			`* GNU General Public License for more details. *`
			`* *`
			`* You should have received a copy of the GNU General Public License *`
			`* along with this program; if not, write to the *`
			`* Free Software Foundation, Inc., *`
			`* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *`
			`***************************************************************************/`

			`#ifndef HTML_PARSER_H`
			`#define HTML_PARSER_H`

Trinity Qt initial conversion git-svn-id: svn://anonsvn.kde.org/home/kde/branches/trinity/kdewebdev@1157656 283d02a7-25f6-0310-bc7c-ecb5cbfe19da 15 years ago			`#include <tqstring.h>`
Copy the KDE 3.5 branch to branches/trinity for new KDE 3.5 features. BUG:215923 git-svn-id: svn://anonsvn.kde.org/home/kde/branches/trinity/kdewebdev@1054174 283d02a7-25f6-0310-bc7c-ecb5cbfe19da 15 years ago
			`#include <vector>`


			`#include "mstring.h"`
			`#include "node.h"`

			`#include <iostream>`

			`using namespace std;`

			`typedef unsigned int uint;`



			`class HtmlParser`
			`{`
			`public:`

			`HtmlParser();`
Trinity Qt initial conversion git-svn-id: svn://anonsvn.kde.org/home/kde/branches/trinity/kdewebdev@1157656 283d02a7-25f6-0310-bc7c-ecb5cbfe19da 15 years ago			`HtmlParser(TQString const& documento);`
Copy the KDE 3.5 branch to branches/trinity for new KDE 3.5 features. BUG:215923 git-svn-id: svn://anonsvn.kde.org/home/kde/branches/trinity/kdewebdev@1054174 283d02a7-25f6-0310-bc7c-ecb5cbfe19da 15 years ago			`~HtmlParser();`

			`vector<Node*> const& nodes() const;`
			`bool hasBaseUrl() const;`
			`bool hasTitle() const;`
			`bool hasContentType() const;`
			`NodeBASE const& baseUrl() const;`
			`NodeTITLE const& title() const;`
			`NodeMETA const& contentTypeMetaNode() const;`

			`static uint estimativaLinks(uint doc_size);`
			`/**`
			`* Convenience function for performance as it only parse in order`
			`* to get the charset.`
			`*/`
Trinity Qt initial conversion git-svn-id: svn://anonsvn.kde.org/home/kde/branches/trinity/kdewebdev@1157656 283d02a7-25f6-0310-bc7c-ecb5cbfe19da 15 years ago			`static TQString findCharsetInMetaElement(TQString const& html);`
Copy the KDE 3.5 branch to branches/trinity for new KDE 3.5 features. BUG:215923 git-svn-id: svn://anonsvn.kde.org/home/kde/branches/trinity/kdewebdev@1054174 283d02a7-25f6-0310-bc7c-ecb5cbfe19da 15 years ago
			`// test:`
			`void mostra() const;`

			`private:`

Trinity Qt initial conversion git-svn-id: svn://anonsvn.kde.org/home/kde/branches/trinity/kdewebdev@1157656 283d02a7-25f6-0310-bc7c-ecb5cbfe19da 15 years ago			`vector<TQString> const& parseNodesOfType(TQString const& element);`
Copy the KDE 3.5 branch to branches/trinity for new KDE 3.5 features. BUG:215923 git-svn-id: svn://anonsvn.kde.org/home/kde/branches/trinity/kdewebdev@1054174 283d02a7-25f6-0310-bc7c-ecb5cbfe19da 15 years ago			`/**`
			`* Vector nodes passed for performance.`
			`*/`
Trinity Qt initial conversion git-svn-id: svn://anonsvn.kde.org/home/kde/branches/trinity/kdewebdev@1157656 283d02a7-25f6-0310-bc7c-ecb5cbfe19da 15 years ago			`static void parseNodesOfType(TQString const& element, TQString const& doc, vector<TQString>& nodes);`
Copy the KDE 3.5 branch to branches/trinity for new KDE 3.5 features. BUG:215923 git-svn-id: svn://anonsvn.kde.org/home/kde/branches/trinity/kdewebdev@1054174 283d02a7-25f6-0310-bc7c-ecb5cbfe19da 15 years ago
			`void parseNodesOfTypeA();`
			`void parseNodesOfTypeAREA();`
			`void parseNodesOfTypeLINK();`
			`void parseNodesOfTypeMETA();`
			`void parseNodesOfTypeIMG();`
			`void parseNodesOfTypeFRAME();`
			`void parseNodesOfTypeIFRAME();`
			`void parseNodesOfTypeBASE();`
			`void parseNodesOfTypeTITLE();`

			`void stripComments();`
			`void stripScriptContent();`

			`/**`
			`Return the index of the next character of the end of tag.`
			`e.g.`
			`endOfTag("<img src=\"bad > luck\">") => 22 (not 15)`
			`*/`
Trinity Qt initial conversion git-svn-id: svn://anonsvn.kde.org/home/kde/branches/trinity/kdewebdev@1157656 283d02a7-25f6-0310-bc7c-ecb5cbfe19da 15 years ago			`static int endOfTag(TQString const& s, int index = 0, TQChar end_of_tag = '>');`
Copy the KDE 3.5 branch to branches/trinity for new KDE 3.5 features. BUG:215923 git-svn-id: svn://anonsvn.kde.org/home/kde/branches/trinity/kdewebdev@1054174 283d02a7-25f6-0310-bc7c-ecb5cbfe19da 15 years ago
			`private:`

Trinity Qt initial conversion git-svn-id: svn://anonsvn.kde.org/home/kde/branches/trinity/kdewebdev@1157656 283d02a7-25f6-0310-bc7c-ecb5cbfe19da 15 years ago			`vector<TQString> aux_; // for what the hell is this? looks ugly... maybe I was drunk, can't remember`
Copy the KDE 3.5 branch to branches/trinity for new KDE 3.5 features. BUG:215923 git-svn-id: svn://anonsvn.kde.org/home/kde/branches/trinity/kdewebdev@1054174 283d02a7-25f6-0310-bc7c-ecb5cbfe19da 15 years ago			`vector<Node*> nodes_;`
			`NodeBASE node_BASE_;`
			`NodeTITLE node_TITLE_;`
			`NodeMETA node_META_content_type_;`
			`bool is_content_type_set_;`

Trinity Qt initial conversion git-svn-id: svn://anonsvn.kde.org/home/kde/branches/trinity/kdewebdev@1157656 283d02a7-25f6-0310-bc7c-ecb5cbfe19da 15 years ago			`TQString document_;`
			`TQString script_; // Fica aqui guardado (JavaScript, etc)`
			`TQString comments_;`
Copy the KDE 3.5 branch to branches/trinity for new KDE 3.5 features. BUG:215923 git-svn-id: svn://anonsvn.kde.org/home/kde/branches/trinity/kdewebdev@1054174 283d02a7-25f6-0310-bc7c-ecb5cbfe19da 15 years ago			`};`


			`inline HtmlParser::~HtmlParser()`
			`{`
			`//kdDebug(23100) << "*";`
			`}`

			`inline uint HtmlParser::estimativaLinks(uint doc_size)`
			`{`
			`return doc_size / 100; // valor estimado...`
			`}`

			`inline bool HtmlParser::hasContentType() const`
			`{`
			`return is_content_type_set_;`
			`}`

			`#endif`