You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

937 lines
23 KiB

//
// URL.cc
//
// URL: A URL parsing class, implementing as closely as possible the standard
// laid out in RFC2396 (e.g. http://www.faqs.org/rfcs/rfc2396.html)
// including support for multiple services. (schemes in the RFC)
//
// Part of the ht://Dig package <http://www.htdig.org/>
// Copyright (c) 1995-2004 The ht://Dig Group
// For copyright details, see the file COPYING in your distribution
// or the GNU Library General Public License (LGPL) version 2 or later
// <http://www.gnu.org/copyleft/lgpl.html>
//
// $Id: URL.cc,v 1.16 2004/06/04 08:51:01 angusgb Exp $
//
#ifdef HAVE_CONFIG_H
#include "htconfig.h"
#endif /* HAVE_CONFIG_H */
#include "URL.h"
#include "QuotedStringList.h"
#include "Dictionary.h"
#include "HtConfiguration.h"
#include "StringMatch.h"
#include "StringList.h"
#include "HtURLRewriter.h"
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#ifdef HAVE_STD
#include <fstream>
#ifdef HAVE_NAMESPACES
using namespace std;
#endif
#else
#include <fstream.h>
#endif /* HAVE_STD */
#include <sys/types.h>
#include <ctype.h>
#ifndef _MSC_VER /* _WIN32 */
#include <sys/socket.h>
#include <netdb.h>
#include <arpa/inet.h>
#endif
// Port returned by URL::DefaultPort() for the "news" service.
#define NNTP_DEFAULT_PORT 119
// Table mapping a service name to the number of slashes that follow its
// ':' (stored as a one-character String). It stays null until the first
// call to URL::slashes(), which allocates and fills it.
static Dictionary *slashCount = 0;
//*****************************************************************************
// URL::URL()
//   Default constructor.  Every member is initialised from 0; nothing is
//   parsed, so the object carries no URL until parse() or an assignment
//   fills it in.
//
URL::URL()
    : _url(0), _path(0), _service(0),
      _host(0), _port(0),
      _normal(0), _hopcount(0),
      _signature(0), _user(0)
{
    // Nothing else to do.
}
//*****************************************************************************
// URL::URL(const URL& rhs)
//   Copy constructor.  Performs a member-by-member copy of rhs, including
//   the cached state (_normal, _signature) and the hop count.
//
URL::URL(const URL& rhs)
    : _url(rhs._url), _path(rhs._path), _service(rhs._service),
      _host(rhs._host), _port(rhs._port),
      _normal(rhs._normal), _hopcount(rhs._hopcount),
      _signature(rhs._signature), _user(rhs._user)
{
    // Nothing else to do.
}
//*****************************************************************************
// URL::URL(const String &nurl)
//   Build a URL from its textual form.  All members start from 0 and the
//   string is then handed to parse(), which fills them in.
//
URL::URL(const String &nurl)
    : _url(0), _path(0), _service(0),
      _host(0), _port(0),
      _normal(0), _hopcount(0),
      _signature(0), _user(0)
{
    parse(nurl);
}
//*****************************************************************************
// const URL &URL::operator = (const URL &rhs)
//   Assignment operator.  Copies every attribute of rhs into this object
//   and returns a reference to this object.  Self-assignment is a no-op.
//
const URL &URL::operator = (const URL &rhs)
{
    if (this != &rhs)
    {
        // Textual parts of the URL
        _url       = rhs._url;
        _path      = rhs._path;
        _service   = rhs._service;
        _host      = rhs._host;
        _port      = rhs._port;

        // Bookkeeping / cached state
        _normal    = rhs._normal;
        _hopcount  = rhs._hopcount;
        _signature = rhs._signature;
        _user      = rhs._user;
    }
    return *this;
}
//*****************************************************************************
// URL::URL(const String &url, const URL &parent)
//   Parse a reference given a parent url.  This is needed to resolve relative
//   references which do NOT have a full url.
//
//   url    - the reference as found in the parent document; may be a
//            complete URL, a network-path reference ("//host/..."), an
//            absolute path ("/...") or a path relative to the parent.
//   parent - the URL of the document containing the reference.  Service,
//            host, port, user, normalisation state and signature are
//            inherited from it; the hop count is the parent's plus one.
//
//   Whitespace is stripped from the reference (interior blanks become "%20"
//   when the "allow_space_in_url" option is set) and any "#anchor" is
//   removed before the reference is classified.
//
URL::URL(const String &url, const URL &parent)
    : _url(0),
      _path(0),
      _service(parent._service),
      _host(parent._host),
      _port(parent._port),
      _normal(parent._normal),
      _hopcount(parent._hopcount + 1),  // one hop *after* the parent
      _signature(parent._signature),
      _user(parent._user)
{
    HtConfiguration* config= HtConfiguration::config();
    int allowspace = config->Boolean("allow_space_in_url", 0);
    String temp;
    const char *urp = url.get();
    while (*urp)
    {
        if (*urp == ' ' && temp.length() > 0 && allowspace)
        {
            // Replace space character with %20 if there's more non-space
            // characters to come...
            // (The <ctype.h> classifiers need a value representable as
            // unsigned char; a plain char may be negative for high-bit
            // bytes, hence the casts.)
            const char *s = urp+1;
            while (*s && isspace((unsigned char)*s))
                s++;
            if (*s)
                temp << "%20";
        }
        else if (!isspace((unsigned char)*urp))
            temp << *urp;
        urp++;
    }
    char* ref = temp;

    //
    // Strip any optional anchor from the reference.  If, however, the
    // reference contains CGI parameters after the anchor, the parameters
    // will be moved left to replace the anchor.  The overall effect is that
    // the anchor is removed.
    // Thanks goes to David Filiatrault <dwf@WebThreads.Com> for suggesting
    // this removal process.
    //
    char *anchor = strchr(ref, '#');
    char *params = strchr(ref, '?');
    if (anchor)
    {
        *anchor = '\0';
        if (params)
        {
            if (anchor < params)
            {
                // Slide "?params" down over the anchor text.
                while (*params)
                {
                    *anchor++ = *params++;
                }
                *anchor = '\0';
            }
        }
    }

    //
    // If, after the removal of a possible '#' we have nothing left,
    // we just want to use the base URL (we're on the same page but
    // different anchors)
    //
    if (!*ref)
    {
        // We've already copied much of the info
        _url = parent._url;
        _path = parent._path;
        // Since this is on the same page, we want the same hopcount
        _hopcount = parent._hopcount;
        return;
    }

    // OK, now we need to work out what type of child URL this is
    char *p = ref;
    while (isalpha((unsigned char)*p))  // Skip through the service portion
        p++;
    int hasService = (*p == ':');

    // Why single out http?  Shouldn't others be the same?
    // Child URL of the form https:/child or ftp:child called "full"
    // How about using slashes()?
    if (hasService && ((strncmp(ref, "http://", 7) == 0) ||
                       (strncmp(ref, "http:", 5) != 0)))
    {
        //
        // No need to look at the parent url since this is a complete url...
        //
        parse(ref);
    }
    else if (strncmp(ref, "//", 2) == 0)
    {
        // Network-path reference: borrow the parent's _service to turn it
        // into a complete url.
        String fullref(parent._service);
        fullref << ':' << ref;
        parse((char*)fullref);
    }
    else
    {
        if (hasService)
            ref = p + 1;        // Relative URL, skip "http:"

        if (*ref == '/')
        {
            //
            // The reference is on the same server as the parent, but
            // an absolute path was given...
            //
            _path = ref;

            //
            // Get rid of loop-causing constructs in the path
            //
            normalizePath();
        }
        else
        {
            //
            // The reference is relative to the parent
            //
            _path = parent._path;
            int i = _path.indexOf('?');
            if (i >= 0)
            {
                // Drop the parent's query string before resolving.
                _path.chop(_path.length() - i);
            }

            //
            // Remove any leading "./" sequences which could get us into
            // recursive loops.
            //
            while (strncmp(ref, "./", 2) == 0)
                ref += 2;

            if (_path.last() == '/')
            {
                //
                // Parent was a directory.  Easy enough: just append
                // the current ref to it
                //
                _path << ref;
            }
            else
            {
                //
                // Parent was a file.  We need to strip the last part
                // of the path before we add the reference to it.
                // (A separate String is used so that 'ref', which points
                // into 'temp', stays valid.)
                //
                String parentDir = _path;
                p = strrchr((char*)parentDir, '/');
                if (p)
                {
                    p[1] = '\0';
                    _path = parentDir.get();
                    _path << ref;
                }
                else
                {
                    //
                    // Something must be wrong since there were no '/'
                    // found in the parent url.
                    //
                    // We do nothing here.  The new url is the parent.
                    //
                }
            }

            //
            // Get rid of loop-causing constructs in the path
            //
            normalizePath();
        }

        //
        // Build the url.  (Note, the host name has NOT been normalized!)
        // No need for this if we have called URL::parse.
        //
        constructURL();
    }
}
//*****************************************************************************
// void URL::rewrite()
//   Hand the current url string to the HtURLRewriter singleton.  When the
//   rewriter reports a positive result, the (possibly modified) string is
//   parsed again so that all members agree with it.
//
void URL::rewrite()
{
    if (!(HtURLRewriter::instance()->replace(_url) > 0))
        return;                 // nothing to re-parse

    parse(_url.get());
}
//*****************************************************************************
// void URL::parse(const String &u)
//   Given a URL string, extract the service, host, port, and path from it.
//
//   u - the URL text.  Whitespace is stripped (interior blanks become "%20"
//       when the "allow_space_in_url" option is set) and anything from the
//       first '#' onwards is ignored.
//
//   On return _service, _host, _port, _user, _path and _url have been
//   rebuilt; _normal and _signature are reset.  When no service is present
//   "http" is assumed.
//
//   NOTE: the splitting below relies on strtok(), which keeps hidden state
//   between calls; the order of the strtok() calls must not be changed.
//
void URL::parse(const String &u)
{
    HtConfiguration* config= HtConfiguration::config();
    int allowspace = config->Boolean("allow_space_in_url", 0);
    String temp;
    const char *urp = u.get();
    while (*urp)
    {
        if (*urp == ' ' && temp.length() > 0 && allowspace)
        {
            // Replace space character with %20 if there's more non-space
            // characters to come...
            // (The <ctype.h> classifiers need a value representable as
            // unsigned char; a plain char may be negative for high-bit
            // bytes, hence the casts.)
            const char *s = urp+1;
            while (*s && isspace((unsigned char)*s))
                s++;
            if (*s)
                temp << "%20";
        }
        else if (!isspace((unsigned char)*urp))
            temp << *urp;
        urp++;
    }
    char *nurl = temp;

    //
    // Ignore any part of the URL that follows the '#' since this is just
    // an index into a document.
    //
    char *p = strchr(nurl, '#');
    if (p)
        *p = '\0';

    // Some members need to be reset.  If not, the caller would
    // have used URL::URL(char *ref, URL &parent)
    // (which may call us, if the URL is found to be absolute).
    _normal = 0;
    _signature = 0;
    _user = 0;

    //
    // Extract the service
    //
    p = strchr(nurl, ':');
    if (p)
    {
        _service = strtok(nurl, ":");
        p = strtok(0, "\n");    // everything after the first ':'
    }
    else
    {
        _service = "http";
        p = strtok(nurl, "\n");
    }
    _service.lowercase();

    //
    // Extract the host
    //
    if (!p || strncmp(p, "//", 2) != 0)
    {
        // No host specified, it's all a path.
        _host = 0;
        _port = 0;
        _url = 0;
        if (p)          // if non-NULL, skip (some) leading slashes in path
        {
            int i;
            for (i = slashes (_service); i > 0 && *p == '/'; i--)
                p++;
            if (i)      // if fewer slashes than specified for protocol don't
                        // delete any. -> Backwards compatible (necessary??)
                p -= slashes (_service) - i;
        }
        _path = p;
        if (strcmp((char*)_service, "file") == 0 || slashes (_service) < 2)
            _host = "localhost";
    }
    else
    {
        p += 2;

        //
        // p now points to the host
        //
        char *q = strchr(p, ':');
        char *slash = strchr(p, '/');
        _path = "/";
        if (strcmp((char*)_service, "file") == 0)
        {
            // These should be of the form file:/// (i.e. no host)
            // if there is a file://host/path then strip the host
            if (strncmp(p, "/", 1) != 0)
            {
                p = strtok(p, "/");
                _path << strtok(0, "\n");
            }
            else
                _path << strtok(p+1, "\n");  // _path is "/" - don't double
            _host = "localhost";
            _port = 0;
        }
        else if (q && ((slash && slash > q) || !slash))
        {
            // A ':' appears before the first '/': "host:port/path".
            _host = strtok(p, ":");
            p = strtok(0, "/");
            if (p)
                _port = atoi(p);
            if (!p || _port <= 0)
                _port = DefaultPort();

            //
            // The rest of the input string is the path.
            //
            _path << strtok(0, "\n");
        }
        else
        {
            // No explicit port: "host/path".
            _host = strtok(p, "/");
            _host.chop(" \t");
            _port = DefaultPort();

            //
            // The rest of the input string is the path.
            //
            _path << strtok(0, "\n");
        }

        // Check to see if host contains a user@ portion
        int atMark = _host.indexOf('@');
        if (atMark != -1)
        {
            _user = _host.sub(0, atMark);
            _host = _host.sub(atMark + 1);
        }
    }

    //
    // Get rid of loop-causing constructs in the path
    //
    normalizePath();

    //
    // Build the url.  (Note, the host name has NOT been normalized!)
    //
    constructURL();
}
//*****************************************************************************
// void URL::normalizePath()
//   Rewrite _path in place to its minimal form: collapse "//" (unless the
//   "allow_double_slash" option is set), drop "/./" components, resolve
//   "/../" components, turn "%7E" into "~", lowercase the path when the
//   "case_sensitive" option is off, and finally let removeIndex() strip a
//   default document name.  Everything from the first '?' onwards (the
//   query string) is left alone by the component rewrites.
//
//   Called from: URL(const String &url, const URL &parent)
//                URL::parse(const String &u)
//
void URL::normalizePath()
{
    HtConfiguration* config= HtConfiguration::config();
    int i, limit;
    String newPath;

    // pathend is the index just past the path proper, i.e. the position of
    // the '?' or the string length.  It is recomputed after every edit.
    int pathend = _path.indexOf('?');   // Don't mess up query strings.
    if (pathend < 0)
        pathend = _path.length();

    //
    // get rid of "//" first, or "/foo//../" will become "/foo/" not "/"
    // Some database lookups interpret empty paths (// != /), so give
    // the user the option to turn this off.
    //
    if (!config->Boolean ("allow_double_slash"))
        while ((i = _path.indexOf("//")) >= 0 && i < pathend)
        {
            newPath = _path.sub(0, i).get();
            newPath << _path.sub(i + 1).get();
            _path = newPath;
            pathend = _path.indexOf('?');
            if (pathend < 0)
                pathend = _path.length();
        }

    //
    // Next get rid of redundant "/./".  This could cause infinite
    // loops.  Moreover, "/foo/./../" should become "/", not "/foo/"
    //
    while ((i = _path.indexOf("/./")) >= 0 && i < pathend)
    {
        newPath = _path.sub(0, i).get();
        newPath << _path.sub(i + 2).get();
        _path = newPath;
        pathend = _path.indexOf('?');
        if (pathend < 0)
            pathend = _path.length();
    }

    //
    // A path that *ends* in "/." loses the dot.  The suffix is tested
    // directly: searching for the first "/." would be fooled by an earlier
    // dot-file component such as "/.hidden/".
    //
    if (pathend >= 2 && strncmp(_path.get() + pathend - 2, "/.", 2) == 0)
    {
        i = pathend - 2;
        newPath = _path.sub(0, i+1).get();      // keep trailing slash
        newPath << _path.sub(i + 2).get();
        _path = newPath;
        pathend--;
    }

    //
    // Now that "empty" path components are gone, remove ("/../").
    //
    while ((i = _path.indexOf("/../")) >= 0 && i < pathend)
    {
        if ((limit = _path.lastIndexOf('/', i - 1)) >= 0)
        {
            // Drop the component preceding the "/../" as well.
            newPath = _path.sub(0, limit).get();
            newPath << _path.sub(i + 3).get();
            _path = newPath;
        }
        else
        {
            // Already at the top level: just discard the "/..".
            _path = _path.sub(i + 3).get();
        }
        pathend = _path.indexOf('?');
        if (pathend < 0)
            pathend = _path.length();
    }

    //
    // Likewise for a path that *ends* in "/..": test the suffix directly
    // rather than the first "/.." found anywhere in the string.
    //
    if (pathend >= 3 && strncmp(_path.get() + pathend - 3, "/..", 3) == 0)
    {
        i = pathend - 3;
        if ((limit = _path.lastIndexOf('/', i - 1)) >= 0)
            newPath = _path.sub(0, limit+1).get();  // keep trailing slash
        else
            newPath = '/';
        newPath << _path.sub(i + 3).get();
        _path = newPath;
        pathend = _path.indexOf('?');
        if (pathend < 0)
            pathend = _path.length();
    }

    // The RFC gives us a choice of what to do when we have .. left and
    // we're at the top level.  By principle of least surprise, we'll just
    // toss any "leftovers".  Otherwise, we'd have a loop here to add them.

    // Finally change all "%7E" to "~" for sanity
    while ((i = _path.indexOf("%7E")) >= 0 && i < pathend)
    {
        newPath = _path.sub(0, i).get();
        newPath << "~";
        newPath << _path.sub(i + 3).get();
        _path = newPath;
        pathend = _path.indexOf('?');
        if (pathend < 0)
            pathend = _path.length();
    }

    // If the server *isn't* case sensitive, we want to lowercase the path
    if (!config->Boolean("case_sensitive", 1))
        _path.lowercase();

    // And don't forget to remove index.html or similar file.
    // (the check for services that must keep it is inside removeIndex)
    removeIndex(_path, _service);
}
//*****************************************************************************
// void URL::dump()
//   Debugging aid: print the components of this URL to cout, one
//   "name = value" line per component.
//
void URL::dump()
{
    cout << "service = " << _service.get() << endl
         << "user = "    << _user.get()    << endl
         << "host = "    << _host.get()    << endl
         << "port = "    << _port          << endl
         << "path = "    << _path          << endl
         << "url = "     << _url           << endl;
}
//*****************************************************************************
// void URL::path(const String &newpath)
//
void URL::path(const String &newpath)
{
HtConfiguration* config= HtConfiguration::config();
_path = newpath;
if (!config->Boolean("case_sensitive",1))
_path.lowercase();
constructURL();
}
//*****************************************************************************
// void URL::removeIndex(String &path, String &service)
//   Attempt to remove the remove_default_doc from the end of a URL path if
//   the service allows that.  (File, ftp don't.  Do others?)
//   This needs to be done to normalize the paths and make .../ the
//   same as .../index.html
//
//   path    - path to examine; chopped in place when its last component is
//             one of the configured default documents.  Paths that are
//             empty, contain a '?', or contain no '/' are left alone.
//   service - service the path belongs to; "file" and "ftp" are never
//             touched.
//
//   The list of default documents is read from the "remove_default_doc"
//   option on the first call that gets that far, and is kept for all later
//   calls.
//
//   Called from: URL::normalize() from URL::signature() [redundant?]
//                URL::normalizePath()
//
void URL::removeIndex(String &path, String &service)
{
    HtConfiguration* config= HtConfiguration::config();
    static StringMatch *defaultdoc = 0;

    // Decide on the service that was passed in, not on this object's own.
    if (strcmp((char*)service, "file") == 0 ||
        strcmp((char*)service, "ftp") == 0)
        return;

    if (path.length() == 0 || strchr((char*)path, '?'))
        return;

    // Index of the first character after the last '/'.
    int filename = path.lastIndexOf('/') + 1;
    if (filename == 0)
        return;

    if (! defaultdoc)
    {
        StringList l(config->Find("remove_default_doc"), " \t");
        defaultdoc = new StringMatch();
        defaultdoc->IgnoreCase();
        defaultdoc->Pattern(l.Join('|'));
    }

    // Only strip when the match covers the whole final component.
    int which, length;
    if (defaultdoc->hasPattern() &&
        defaultdoc->CompareWord((char*)path.sub(filename), which, length) &&
        filename+length == path.length())
        path.chop(path.length() - filename);
}
//*****************************************************************************
// static unsigned long unpackHostAddress(const char *bytes, int len)
//   Helper for URL::normalize().  Copies at most four address bytes into a
//   zero-initialised 32-bit value and returns it, so that the result never
//   contains indeterminate bytes and an oversized length cannot write past
//   the destination.
//
static unsigned long unpackHostAddress(const char *bytes, int len)
{
    unsigned int packed = 0;
    if (len > (int) sizeof(packed))
        len = (int) sizeof(packed);
    if (bytes && len > 0)
        memcpy((char *) &packed, bytes, len);
    return packed;
}

//*****************************************************************************
// void URL::normalize()
//   Make sure that URLs are always in the same format.
//
//   Does nothing when there is no service, when the URL is already marked
//   normal, or when the service does not use exactly two slashes (i.e. it
//   does not name a host).  Otherwise the default document is stripped, the
//   host is lowercased, and, when the "allow_virtual_hosts" option is off,
//   the host is replaced by the first host name that was seen for the same
//   address.  Server aliases are then applied and _url is rebuilt.
//
//   If the host name cannot be resolved the function returns early and the
//   URL is NOT marked as normal.
//
void URL::normalize()
{
    HtConfiguration* config= HtConfiguration::config();
    static int hits = 0, misses = 0;    // lookup-cache statistics

    if (_service.length() == 0 || _normal)
        return;

    // if service specifies "doesn't specify an IP host", don't normalize it
    if (slashes (_service) != 2)
        return;

    // (the check for services that must keep it is inside removeIndex)
    removeIndex(_path, _service);

    //
    // Convert a hostname to an IP address
    //
    _host.lowercase();
    if (!config->Boolean("allow_virtual_hosts", 1))
    {
        // host name -> raw address bytes, filled in as names are resolved
        static Dictionary hostbyname;
        unsigned long addr = 0;
        struct hostent *hp;

        String *ip = (String *) hostbyname[_host];
        if (ip)
        {
            addr = unpackHostAddress(ip->get(), ip->length());
            hits++;
        }
        else
        {
            // Dotted-quad hosts need no lookup.
            addr = inet_addr(_host.get());
            if (addr == 0xffffffff)
            {
                hp = gethostbyname(_host.get());
                if (hp == NULL)
                {
                    return;
                }
                addr = unpackHostAddress((char *) hp->h_addr, hp->h_length);
                ip = new String((char *) hp->h_addr, hp->h_length);
                hostbyname.Add(_host, ip);
                misses++;
            }
        }

        // address -> first host name seen for that address
        static Dictionary machines;
        String key;
        key << int(addr);
        String *realname = (String *) machines[key];
        if (realname)
            _host = realname->get();
        else
            machines.Add(key, new String(_host));
    }
    ServerAlias();

    //
    // Reconstruct the url
    //
    constructURL();
    _normal = 1;
    _signature = 0;     // force signature() to rebuild it
}
//*****************************************************************************
// const String &URL::signature()
//   Return a string which uniquely identifies the server the current
//   URL is referring to, in the form  service://user@host:port/
//   (in short, the URL pointing to the root of this server).
//   The string is built on first use, after normalizing the URL if that
//   has not happened yet, and is stored in _signature for later calls.
//
const String &URL::signature()
{
    if (_signature.length() == 0)
    {
        if (!_normal)
            normalize();

        _signature = _service;
        _signature << "://";
        if (_user.length())
        {
            _signature << _user << '@';
        }
        _signature << _host << ':' << _port << '/';
    }
    return _signature;
}
//*****************************************************************************
// void URL::ServerAlias()
//   Takes care of the server aliases, which attempt to simplify virtual
//   host problems.
//
//   The "server_aliases" option is read once, on the first call, as a
//   whitespace-separated list of "from=to" pairs.  Both sides are
//   lowercased and get ":80" appended when they carry no port.  If this
//   URL's "host:port" matches a "from" entry, _host and _port are replaced
//   by the "to" entry.
//
void URL::ServerAlias()
{
    HtConfiguration* config= HtConfiguration::config();
    static Dictionary *serveraliases= 0;

    if (! serveraliases)
    {
        String l= config->Find("server_aliases");
        String from, *to;
        serveraliases = new Dictionary();
        char *p = strtok(l, " \t");
        char *salias= NULL;
        while (p)
        {
            salias = strchr(p, '=');
            if (! salias)
            {
                // Not a "from=to" pair: skip it.
                p = strtok(0, " \t");
                continue;
            }
            *salias++= '\0';    // split the token at the '='
            from = p;
            from.lowercase();
            if (from.indexOf(':') == -1)
                from.append(":80");
            to= new String(salias);
            to->lowercase();
            if (to->indexOf(':') == -1)
                to->append(":80");
            serveraliases->Add(from.get(), to);
            // fprintf (stderr, "Alias: %s->%s\n", from.get(), to->get());
            p = strtok(0, " \t");
        }
    }

    String *al= 0;
    int newport = 0;
    int delim;
    String serversig = _host;
    serversig << ':' << _port;
    if ((al= (String *) serveraliases->Find(serversig)))
    {
        delim= al->indexOf(':');
        // fprintf(stderr, "\nOld URL: %s->%s\n", (char *) serversig, (char *) *al);
        _host= al->sub(0,delim).get();
        // Only take the alias port when one was actually parsed; an alias
        // such as "host:" would otherwise leave newport unset.
        if (sscanf((char*)al->sub(delim+1), "%d", &newport) == 1)
            _port= newport;
        // fprintf(stderr, "New URL: %s:%d\n", (char *) _host, _port);
    }
}
//*****************************************************************************
// int URL::slash(const String &protocol)
// Returns number of slashes folowing the service name for protocol
//
int
URL::slashes(const String &protocol)
{
if (!slashCount)
{
HtConfiguration* config= HtConfiguration::config();
slashCount = new Dictionary();
slashCount->Add (String("mailto"), new String("0"));
slashCount->Add (String("news"), new String("0"));
slashCount->Add (String("http"), new String("2"));
slashCount->Add (String("ftp"), new String("2"));
// file:/// has three, but the last counts as part of the path...
slashCount->Add (String("file"), new String("2"));
QuotedStringList qsl(config->Find("external_protocols"), " \t");
String from;
int i;
int sep,colon;
for (i = 0; qsl[i]; i += 2)
{
from = qsl[i];
sep = from.indexOf("->");
if (sep != -1)
from = from.sub(0, sep).get(); // "get" aids portability...
colon = from.indexOf(":");
// if service specified as "help:/" or "man:", note trailing slashes
// Default is 2.
if (colon != -1)
{
int i;
char count [2];
for (i = colon+1; from[i] == '/'; i++)
;
count [0] = i - colon + '0' - 1;
count [1] = '\0';
from = from.sub(0,colon).get();
slashCount->Add (from, new String (count));
} else
slashCount->Add (from, new String ("2"));
}
}
// Default to two slashes for unknown protocols
String *count = (String *)slashCount->Find(protocol);
return count ? (count->get()[0] - '0') : 2;
}
//*****************************************************************************
// void URL::constructURL()
// Constructs the _url member from everything else
// Also ensures the port number is correct for the service
// Called from URL::URL(const String &url, const URL &parent)
// URL::parse(const String &u)
// URL::path(const String &newpath)
// URL::normalize()
//
void URL::constructURL()
{
if (strcmp((char*)_service, "file") != 0 && _host.length() == 0) {
_url = "";
return;
}
_url = _service;
_url << ":";
// Add correct number of slashes after service name
int i;
for (i = slashes (_service); i > 0; i--)
{
_url << "/";
}
if (slashes (_service) == 2) // services specifying a particular
{ // IP host must begin "service://"
if (strcmp((char*)_service, "file") != 0)
{
if (_user.length())
_url << _user << '@';
_url << _host;
}
if (_port != DefaultPort() && _port != 0) // Different than the default port
_url << ':' << _port;
}
_url << _path;
}
///////
//   Get the default port for the recognised service.
//   Looks _service up in a fixed table; a service that is not listed
//   gets port 80.
///////
int URL::DefaultPort()
{
    static const struct
    {
        const char *name;
        int         port;
    } known[] =
    {
        { "http",   80 },
        { "https",  443 },
        { "ftp",    21 },
        { "gopher", 70 },
        { "file",   0 },
        { "news",   NNTP_DEFAULT_PORT }
    };

    const char *svc = (char*)_service;
    unsigned int k;
    for (k = 0; k < sizeof(known) / sizeof(known[0]); k++)
    {
        if (strcmp(svc, known[k].name) == 0)
            return known[k].port;
    }
    return 80;
}