|
|
|
/***************************************************************************
|
|
|
|
* Copyright (C) 2004 by Paulo Moura Guedes *
|
|
|
|
* moura@tdewebdev.org *
|
|
|
|
* *
|
|
|
|
* This program is free software; you can redistribute it and/or modify *
|
|
|
|
* it under the terms of the GNU General Public License as published by *
|
|
|
|
* the Free Software Foundation; either version 2 of the License, or *
|
|
|
|
* (at your option) any later version. *
|
|
|
|
* *
|
|
|
|
* This program is distributed in the hope that it will be useful, *
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
|
|
|
|
* GNU General Public License for more details. *
|
|
|
|
* *
|
|
|
|
* You should have received a copy of the GNU General Public License *
|
|
|
|
* along with this program; if not, write to the *
|
|
|
|
* Free Software Foundation, Inc., *
|
|
|
|
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
|
|
|
|
***************************************************************************/
|
|
|
|
|
|
|
|
#ifndef GESTOR_PESTQUISA_H
|
|
|
|
#define GESTOR_PESTQUISA_H
|
|
|
|
|
|
|
|
#include <kurl.h>
|
|
|
|
|
|
|
|
#include <tqobject.h>
|
|
|
|
#include <tqstring.h>
|
|
|
|
#include <tqdatetime.h>
|
|
|
|
#include <tqregexp.h>
|
|
|
|
#include <tqmap.h>
|
|
|
|
class TQDomElement;
|
|
|
|
|
|
|
|
#include <vector>
|
|
|
|
|
|
|
|
#include "linkstatus.h"
|
|
|
|
#include "linkchecker.h"
|
|
|
|
#include "../parser/node.h"
|
|
|
|
#include "../parser/url.h"
|
|
|
|
|
|
|
|
// NOTE(review): a using-directive at header scope injects namespace std into
// every file that includes this header. Left in place because code that
// includes this header may depend on it -- confirm before removing.
using namespace std;
|
|
|
|
|
|
|
|
// Map from a URL key string to a KHTMLPart pointer (see
// SearchManager::addHtmlPart / SearchManager::htmlPart).
// NOTE(review): KHTMLPart is not declared in this header; presumably it is
// brought in by one of the project includes above -- confirm.
typedef TQMap<TQString, KHTMLPart*> KHTMLPartMap;
|
|
|
|
|
|
|
|
class SearchManager: public TQObject
|
|
|
|
{
|
|
|
|
Q_OBJECT
|
|
|
|
TQ_OBJECT
|
|
|
|
|
|
|
|
public:
|
|
|
|
|
|
|
|
enum SearchMode {
|
|
|
|
depth,
|
|
|
|
domain,
|
|
|
|
depth_and_domain
|
|
|
|
};
|
|
|
|
|
|
|
|
SearchManager(int max_simultaneous_connections = 3, int time_out = 50,
|
|
|
|
TQObject *parent = 0, const char *name = 0);
|
|
|
|
~SearchManager();
|
|
|
|
|
|
|
|
TQString toXML() const;
|
|
|
|
void save(TQDomElement& element) const;
|
|
|
|
|
|
|
|
KHTMLPartMap const& htmlParts() const { return html_parts_; }
|
|
|
|
|
|
|
|
KHTMLPart* htmlPart(TQString const& key_url) const;
|
|
|
|
void addHtmlPart(TQString const& key_url, KHTMLPart* html_part);
|
|
|
|
void removeHtmlParts();
|
|
|
|
|
|
|
|
void startSearch(KURL const& root);
|
|
|
|
void startSearch(KURL const& root, SearchMode const& modo);
|
|
|
|
void resume();
|
|
|
|
void cancelSearch();
|
|
|
|
|
|
|
|
bool hasDocumentRoot() const;
|
|
|
|
KURL const& documentRoot() const;
|
|
|
|
void setDocumentRoot(KURL const& url);
|
|
|
|
|
|
|
|
void setSearchMode(SearchMode modo);
|
|
|
|
void setDepth(int depth);
|
|
|
|
void setExternalDomainDepth(int depth);
|
|
|
|
void setDomain(TQString const& domain);
|
|
|
|
void setCheckParentDirs(bool flag);
|
|
|
|
void setCheckExternalLinks(bool flag);
|
|
|
|
void setCheckRegularExpressions(bool flag);
|
|
|
|
void setRegularExpression(TQString const& reg_exp, bool case_sensitive);
|
|
|
|
void setTimeOut(int time_out);
|
|
|
|
|
|
|
|
void cleanItems();
|
|
|
|
void reset();
|
|
|
|
|
|
|
|
bool searching() const;
|
|
|
|
bool localDomain(KURL const& url, bool restrict = true) const;
|
|
|
|
//bool isLocalRestrict(KURL const& url) const;
|
|
|
|
SearchMode const& searchMode() const;
|
|
|
|
bool checkRegularExpressions() const { return check_regular_expressions_; }
|
|
|
|
bool existUrl(KURL const& url, KURL const& url_parent) const;
|
|
|
|
LinktqStatus const* linktqStatus(TQString const& s_url) const;
|
|
|
|
int checkedLinks() const;
|
|
|
|
TQTime timeElapsed() const;
|
|
|
|
bool checkParentDirs() const;
|
|
|
|
bool checkExternalLinks() const;
|
|
|
|
LinktqStatus const* linkStatusRoot() const;
|
|
|
|
int maxSimultaneousConnections() const;
|
|
|
|
int timeOut() const;
|
|
|
|
|
|
|
|
bool sendIdentification() const { return send_identification_; }
|
|
|
|
TQString const& userAgent() const { return user_agent_; }
|
|
|
|
|
|
|
|
private:
|
|
|
|
|
|
|
|
void checkRoot();
|
|
|
|
void checkVectorLinks(vector<LinktqStatus*> const& links); // corresponde a um no de um nivel de depth
|
|
|
|
vector<LinktqStatus*> tqchildren(LinktqStatus* link);
|
|
|
|
void startSearch();
|
|
|
|
void continueSearch();
|
|
|
|
void finnish();
|
|
|
|
void pause();
|
|
|
|
vector<LinktqStatus*> const& nodeToAnalize() const;
|
|
|
|
vector<LinktqStatus*> chooseLinks(vector<LinktqStatus*> const& links);
|
|
|
|
void checkLinksSimultaneously(vector<LinktqStatus*> const& links);
|
|
|
|
void addLevel();
|
|
|
|
bool checkableByDomain(KURL const& url, LinktqStatus const& link_parent) const;
|
|
|
|
bool checkable(KURL const& url, LinktqStatus const& link_parent) const;
|
|
|
|
int maximumCurrentConnections() const;
|
|
|
|
bool onlyCheckHeader(LinktqStatus* ls) const;
|
|
|
|
|
|
|
|
/*
|
|
|
|
Entende-se por domain vago um domain do tipo www.google.pt ou google.pt, pelo que,
|
|
|
|
por exemplo, imagens.google.pt, e considerado estar no mesmo domain.
|
|
|
|
pwp.netcabo.pt ou www.google.pt/imagens nao sao considerados domains vagos.
|
|
|
|
*/
|
|
|
|
bool generalDomain() const;
|
|
|
|
bool generalDomainChecked() const; // Para garantir que o procedimento generalDomain() so e chamado uma vez
|
|
|
|
|
|
|
|
private slots:
|
|
|
|
|
|
|
|
void slotRootChecked(const LinktqStatus * link, LinkChecker * checker);
|
|
|
|
void slotLinkChecked(const LinktqStatus * link, LinkChecker * checker);
|
|
|
|
void slotSearchFinished();
|
|
|
|
void slotLinkCheckerFinnished(LinkChecker * checker);
|
|
|
|
|
|
|
|
signals:
|
|
|
|
|
|
|
|
void signalRootChecked(const LinktqStatus * link, LinkChecker * checker);
|
|
|
|
void signalLinkChecked(const LinktqStatus * link, LinkChecker * checker);
|
|
|
|
void signalSearchFinished();
|
|
|
|
void signalSearchPaused();
|
|
|
|
void signalAddingLevelTotalSteps(uint number_of_links);
|
|
|
|
void signalAddingLevelProgress();
|
|
|
|
void signalLinksToCheckTotalSteps(uint links_to_check);
|
|
|
|
//void signalLinksToCheckProgress();
|
|
|
|
|
|
|
|
private:
|
|
|
|
|
|
|
|
int max_simultaneous_connections_;
|
|
|
|
SearchMode search_mode_;
|
|
|
|
LinktqStatus root_;
|
|
|
|
bool has_document_root_;
|
|
|
|
KURL document_root_url_; // in case of non http protocols the document root must be explicitly given
|
|
|
|
int depth_;
|
|
|
|
int current_depth_;
|
|
|
|
int external_domain_depth_;
|
|
|
|
int current_node_;
|
|
|
|
int current_index_;
|
|
|
|
int links_being_checked_;
|
|
|
|
int finished_connections_;
|
|
|
|
int maximum_current_connections_;
|
|
|
|
TQRegExp reg_exp_;
|
|
|
|
TQString domain_;
|
|
|
|
bool general_domain_;
|
|
|
|
bool checked_general_domain_;
|
|
|
|
int time_out_;
|
|
|
|
int current_connections_;
|
|
|
|
bool send_identification_; // user-agent
|
|
|
|
TQString user_agent_;
|
|
|
|
|
|
|
|
bool canceled_;
|
|
|
|
bool searching_;
|
|
|
|
int checked_links_;
|
|
|
|
TQTime time_;
|
|
|
|
int ignored_links_;
|
|
|
|
bool check_parent_dirs_;
|
|
|
|
bool check_external_links_;
|
|
|
|
bool check_regular_expressions_;
|
|
|
|
uint number_of_level_links_;
|
|
|
|
uint number_of_links_to_check_;
|
|
|
|
vector< vector< vector <LinktqStatus*> > > search_results_;
|
|
|
|
KHTMLPartMap html_parts_;
|
|
|
|
};
|
|
|
|
|
|
|
|
#include "searchmanager_impl.h"
|
|
|
|
|
|
|
|
#endif
|