#!/usr/bin/perl -w
use strict;
#
# Version 1.0.1   12-Feb-2002
# Written by David Adams
#
# Uses pdftotext & pdfinfo utilities from the xpdf package
# to read an Adobe Acrobat file and produce HTML output.
#
# Can be called directly from htdig as an external converter,
# or may be called by doc2html.pl converter script.
#
# Usage: pdf2html.pl filename [mime-type] [URL]
#
#   filename   PDF file to convert (required)
#   mime-type  if given, must start with "application/pdf"
#   URL        if given, its last path component (URL-decoded) is used
#              as the document title when the PDF itself has none
#
# The HTML document is written to standard output.
#

####--- Configuration ---####
# Full paths of pdftotext and pdfinfo
# (get them from the xpdf package at http://www.foolabs.com/xpdf/):

#### YOU MUST SET THESE ####

my $PDFTOTEXT = "/... .../pdftotext";
my $PDFINFO   = "/... .../pdfinfo";
#
# De-hyphenation option (only affects end-of-line hyphens):
my $Dehyphenate = 1;
#
# Set title to be used when none is found:
my $Default_title = "Adobe Acrobat Document";
#
# make portable to win32 platform or unix:
my $null = "/dev/null";
if ($^O eq "MSWin32") { $null = "nul"; }
####--- End of configuration ---###

# Set by main() from the command line; read by pdf_head() and pdf_body().
my $Input = '';
my $Name  = '';

# Convert only when run as a program.  When the file is loaded with
# require/do, only the subroutines are defined.
unless (caller) {
    main(@ARGV);
    exit;
}

#------------------------------------------------------------------------------

sub main {
    # Validates the command-line arguments, then writes the HTML head and
    # body for the PDF file to standard output.
    # Arguments: filename, optional MIME type, optional URL.
    # Dies if pdftotext is not executable, if no filename is given, or if
    # the MIME type is not application/pdf.
    my ($file, $mime_type, $url) = @_;

    if (! -x $PDFTOTEXT) { die "Unable to execute pdftotext" }

    # Test for "no argument" rather than truth, so a file named "0" works.
    if (!defined $file or !length $file) {
        die "Usage: pdf2html.pl filename [mime-type] [URL]";
    }
    $Input = $file;

    $mime_type = $mime_type || '';
    if ($mime_type and ($mime_type !~ m#^application/pdf#i)) {
        die "MIME/type $mime_type wrong";
    }

    # Keep only the last path component.  Percent-escapes are decoded only
    # if a path was actually stripped, i.e. the argument looked like a URL.
    $Name = $url || '';
    if ($Name =~ s#^.*/##) {
        $Name =~ s/%([A-Fa-f0-9]{2})/pack("C", hex($1))/ge;
    }

    pdf_head();
    pdf_body();
    return;
}

#------------------------------------------------------------------------------

sub pdf_head {
    #
    # Contributed by Greg Holmes and Michael Fuller
    # (any errors by David Adams)
    #
    # Runs pdfinfo on $Input and prints the HTML head: a TITLE, plus
    # DESCRIPTION and KEYWORDS META elements when pdfinfo reports a subject
    # or keywords.  If pdfinfo gives no title, "[$Name]" is used, or
    # $Default_title when $Name is empty.
    #
    my %meta = (title => '', subject => '', keywords => '');

    # pdfinfo is best effort: its stderr is discarded and a failure to
    # start it only produces a warning.
    my $command = "$PDFINFO " . _shell_quote($Input) . " 2>$null";
    if (open(my $info, '-|', $command)) {
        while (my $line = <$info>) {
            if ($line =~ m/^(title|subject|keywords):\s*(.*)$/is) {
                $meta{lc $1} = clean_pdf($2);
            }
        }
        close $info;    # exit status deliberately ignored (best effort)
    } else { warn "cannot execute pdfinfo" }

    my $title = $meta{title};
    if (not length $title) {
        if ($Name) {
            $title = '[' . $Name . ']';
        } else {
            $title = $Default_title;
        }
    }

    # Values come from the PDF or the URL, so escape markup characters.
    # clean_pdf() has already turned any double quote into a single quote,
    # which keeps the CONTENT attributes well formed.
    print "<HTML>\n<HEAD>\n";
    print "<TITLE>" . _escape_html($title) . "</TITLE>\n";
    if (length $meta{subject}) {
        print '<META NAME="DESCRIPTION" CONTENT="'
            . _escape_html($meta{subject}) . "\">\n";
    }
    if (length $meta{keywords}) {
        print '<META NAME="KEYWORDS" CONTENT="'
            . _escape_html($meta{keywords}) . "\">\n";
    }
    print "</HEAD>\n";

    ###print STDERR "\n$Name:\n";
    ###print STDERR "\tTitle:\t$title\n";
    ###print STDERR "\tDescription:\t$meta{subject}\n";
    ###print STDERR "\tKeywords:\t$meta{keywords}\n";
    return;
}

#------------------------------------------------------------------------------

sub pdf_body {
    # Runs "pdftotext -raw" on $Input and prints the text as the HTML body.
    # Consecutive text lines are separated by <br>; one or more blank lines
    # become a single <p>.  Dies if the pipe cannot be opened.
    my $command = "$PDFTOTEXT -raw " . _shell_quote($Input) . " -";
    open(my $text, '-|', $command)
        or die "$PDFTOTEXT doesn't want to be opened using pipe\n";

    print "<BODY>\n";
    my $separator = '';    # markup printed before the next non-empty line
    while (my $line = <$text>) {
        $line = _join_hyphenated($line, $text) if $Dehyphenate;
        # octal 255 is 0xAD, the soft hyphen in ISO-8859-1 (presumably the
        # encoding pdftotext emits here); replace it with a plain hyphen
        $line =~ s/\255/-/g;
        # replace bell, backspace, tab. etc. with single space:
        $line =~ s/[\000-\040]+/ /g;
        $line = HTML($line);
        if (length $line) {
            print $separator, $line, "\n";
            $separator = "<br>\n";
        } else {
            $separator = "<p>\n";
        }
    }
    # Closing a pipe reports the child's exit status.
    close($text) or warn "pdftotext did not finish cleanly (status $?)\n";

    print "</BODY>\n</HTML>\n";
    return;
}

#------------------------------------------------------------------------------

sub _join_hyphenated {
    # Given a line and the handle it was read from: while the text ends in
    # letter + hyphen, append the next line and, if that line starts with a
    # letter, remove the hyphen and line break to rejoin the word.
    # Returns the (possibly extended) line.
    my ($line, $fh) = @_;
    while ($line =~ m/[A-Za-z\300-\377]-\s*$/) {
        my $next = <$fh>;
        last unless defined $next;    # end of input: nothing to join with
        $line .= $next;
        $line =~ s/([A-Za-z\300-\377])-\s*\n\s*([A-Za-z\300-\377])/$1$2/s;
    }
    return $line;
}

#------------------------------------------------------------------------------

sub _shell_quote {
    # Returns the argument quoted as a single word for the command shell.
    my $arg = shift;
    if ($^O eq "MSWin32") {
        # Windows programs only recognise double quotes; a double quote is
        # not a legal file-name character there, so just drop any.
        $arg =~ s/"//g;
        return '"' . $arg . '"';
    }
    # Bourne shell: close the quote, add an escaped quote, reopen.
    $arg =~ s/'/'\\''/g;
    return "'" . $arg . "'";
}

#------------------------------------------------------------------------------

sub _escape_html {
    # Returns the text with &, < and > replaced by character entities.
    my $text = shift;
    $text =~ s/&/&amp;/g;    # must come first
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    return $text;
}

#------------------------------------------------------------------------------

sub HTML {
    # Returns the text ready for output as HTML: every run of whitespace
    # (form feeds and newlines included) becomes one space, trailing
    # whitespace is removed, and &, < and > are escaped.
    my $text = shift;
    $text =~ s/\s+/ /g;
    $text =~ s/\s+$//;
    return _escape_html($text);
}

#------------------------------------------------------------------------------

sub clean_pdf {
    # removes odd pair of characters that may be in pdfinfo output
    # (octal 376 377, i.e. bytes 0xFE 0xFF)
    # Any double quotes are replaced with single
    my $text = shift;
    chomp $text;
    $text =~ s/\376\377//g;
    $text =~ s/"/'/g;
    return $text;
}

1;