/* RTF2HTML.c, Chuck Shotton - 6/21/93 */ /************************************************************************ * This program takes a stab at converting RTF (Rich Text Format) files * into HTML. There are some limitations that keep RTF from being able to * easily represent things like in-line images and anchors as styles. In * particular, RTF styles apply to entire "paragraphs", so anchors or * images in the middle of a text stream can't easily be represented by * styles. The intent is to ultimately use something like embedded text * color changes to represent these constructs. * * In the meantime, you can take existing Word documents, apply the * correct style sheet, and convert them to HTML with this tool. * * AUTHOR: Chuck Shotton, UT-Houston Academic Computing, * cshotton@oac.hsc.uth.tmc.edu * * Dmitry Potapov, CapitalSoft * dpotapov@capitalsoft.com * * David Lippi, Comune di Prato, Italy * d.lippi@comune.prato.it * * Gabriele Bartolini, Comune di Prato, Italy * g.bartolini@comune.prato.it * * USAGE: rtf2html [rtf_filename] * * BEHAVIOR: * rtf2html will open the specified RTF input file or read from * standard input, writing converted HTML to standard output. * * NOTES: * The RTF document must be formatted with a style sheet that has * style numberings that conform to the style_mappings table * defined in this source file. Characters are converted according * to the ANSI Windows 1252 code or Macintosh. * * MODIFICATIONS: * 6/21/93 : Chuck Shotton - created version 1.0. * 11/26/98 : Dmitry Potapov - version 1.1 beta * 05/07/04 : David Lippi, Gabriele Bartolini - version 1.2 * * Copyright (C) 2004 Comune di Prato * * For copyright details, see the file COPYING in your distribution * or the GNU General Public License (GPL) version 2 or later * * ************************************************************************/ /* Note, the source is formated with 4 character tabs */ #include #include #include #include #include "charset1252.h" #include "charsetmac.h" #ifdef _MSC_VER # define strcasecmp _stricmp #endif #ifndef TRUE #define TRUE -1 #define FALSE 0 #endif #define MAX_LEVELS 40 /*defines the # of nested in-line styles (pairs of {})*/ #define MAX_RTF_TOKEN 40 #define MAX_INLINE_STYLES 5 /*defines # of in-line styles, bold, italic, etc.*/ typedef struct tag_StyleState { unsigned char s: MAX_INLINE_STYLES; } TStyleState; typedef enum { s_plain, s_bold, s_italic, s_underline, s_hidden, /*in-line styles*/ s_para, s_br, /*pseudo style*/ s_h0, s_h1, s_h2, s_h3, s_h4, s_h5, s_h6 /*heading styles*/ } StyleState; char *styles[][2] = { /*HTML Start and end tags for styles*/ {"", ""}, {"", ""}, {"", ""}, {"", ""}, {""}, {"\n", "\n"}, /* {"\n

", "

\n"}, */ {"
\n",""}, {"", ""}, {"

", "

"}, {"

", "

"}, {"

", "

"}, {"

", "

"}, {"
", "
"}, {"
", "
"} }; /* style_mappings maps the style numbers in a RTF style sheet into one of the*/ /* (currently) six paragraph-oriented HTML styles (i.e. heading 1 through 6.)*/ /* Additional styles for lists, etc. should be added here. Style info */ /* ultimately should be read from some sort of config file into these tables.*/ #define MAX_NAME_LEN 40 char style_name[MAX_NAME_LEN]; #define STYLE_NUMBER 7 char *style_namings[STYLE_NUMBER] = { "", "heading 1", "heading 2", "heading 3", "heading 4", "heading 5", "heading 6" }; char style_mappings[STYLE_NUMBER][MAX_RTF_TOKEN]; char style_number[MAX_RTF_TOKEN]; /* RTF tokens that mean something to the parser. All others are ignored. */ typedef enum { t_start, t_fonttbl, t_colortbl, t_stylesheet, t_info, t_s, t_b, t_ul, t_ulw, t_uld, t_uldb, t_i, t_v, t_plain, t_par, t_pict, t_tab, t_bullet, t_cell, t_row, t_line, t_endash, t_emdash, t_rquote, t_end } TokenIndex; char *tokens[] = { "###", "fonttbl", "colortbl", "stylesheet", "info", "s", "b", "ul", "ulw", "uld", "uldb", "i", "v", "plain", "par", "pict", "tab", "bullet", "cell", "row", "line", "endash", "emdash", "rquote", "###" }; TStyleState style_state[MAX_LEVELS], curr_style; short curr_heading; void (*RTF_DoControl)(FILE*,char*,char*); char isBody; char* title; //FILE* f; short level, /*current {} nesting level*/ skip_to_level,/*{} level to which parsing should skip (used to skip */ /* font tables, style sheets, color tables, etc.) */ gobble, /*Flag set to indicate all input should be discarded */ ignore_styles;/*Set to ignore inline style expansions after style use*/ /* Charset */ unsigned char** charset_table; #define CHARSET_DEFAULT 0 // Index of the default charset to use #define CHARSET_NUMBER 2 // Number of charset used #define CHARSET_MAX_LENGTH 20 // Max numbero of char in the charset // metadata used in rtf standard for the charset definition unsigned char *charset[CHARSET_NUMBER] = { "ansi", "mac" }; // variable with the charset definition unsigned char **charset_variablename[CHARSET_NUMBER] = { charset1252, mac }; /**************************************/ int openfile (char * filename, FILE ** f) { int rv = 1; if (filename) { if (!(*f = fopen (filename, "r"))) { fprintf (stderr, "\nError: Input file %s not found.\n", filename); rv = 0; } else { title = filename; } } else { *f = stdin; title="STDIN"; } return rv; } /**************************************/ int closefile (FILE * f) { return fclose (f); } /**************************************/ char RTF_GetChar( FILE* f ) { char ch; do { ch = fgetc( f ); } while ((ch=='\r')||(ch=='\n')); return ch; } /**************************************/ char RTF_UnGetChar(FILE* f, char ch) { return ungetc(ch, f); } /**************************************/ void RTF_PutStr(char* s) { if (gobble) return; fputs(s, stdout); } /**************************************/ void RTF_PutHeader() { RTF_PutStr("\n"); RTF_PutStr(title); RTF_PutStr("\n"); RTF_PutStr("\n"); RTF_PutStr("\n"); } /**************************************/ void RTF_PutChar(char ch) { if (gobble) return; if (!isBody) { RTF_PutHeader(); RTF_PutStr("\n"); isBody=TRUE; } switch (ch) { case '<': RTF_PutStr("<"); break; case '>': RTF_PutStr(">"); break; case '&': RTF_PutStr("&"); break; default: fputc(ch, stdout); } } /**************************************/ void RTF_PlainStyle (TStyleState* s) { int i; for(i=0;is & (1<s=0; } /**************************************/ void RTF_SetStyle(TStyleState* s, StyleState style) { if( (!ignore_styles||(style==s_hidden)) && ((s->s&(1<s|=(1<=MAX_LEVELS) { fprintf(stderr,"Exceed maximum level\n"); exit(-1); } style_state[*level]=curr_style; (*level)++; } /**************************************/ void RTF_PopState(short* level) { int j; TStyleState new_style; if(*level<1) { fprintf(stderr,"RTF parse error: unexpected '}'\n"); exit(-1); } new_style = style_state[*level-1]; /*close off any in-line styles*/ for (j=0;j=MAX_NAME_LEN-1) return; if (is_string) { for (p = ch; p && *p; ++p) { token[len]=*p; ++len; } } else { token[len] = *ch; ++len; } token[len]='\0'; } /**************************************/ void RTF_ClearName(char* token) { token[0]=0; } /**************************************/ TokenIndex GetTokenIndex(char* control) { TokenIndex i; for (i=t_start; i='0')&&(ch<='9')) return ch-'0'; if((ch>='A')&&(ch<='Z')) return ch-'A'+10; if((ch>='a')&&(ch<='z')) return ch-'a'+10; return -1; } /**************************************/ void RTF_BuildArg (FILE * f, char ch, char* arg) { int i=0; if(feof(f)) { arg[0]=0; return; } if(ch=='-') { arg[i++]='-'; ch = RTF_GetChar( f ); if(feof(f)) { arg[0]=0; return; } } for(;isdigit(ch);i++) { arg[i]=ch; if(i>=MAX_RTF_TOKEN-1) { arg[MAX_RTF_TOKEN-1]=0; while(isdigit(ch)) { ch = RTF_GetChar( f ); if(feof(f)) return; } break; } ch = RTF_GetChar( f ); if(feof(f)) { arg[i+1]=0; return; } } arg[i]=0; if(!isspace(ch)) { RTF_UnGetChar(f, ch); } } /**************************************/ void RTF_BuildToken (FILE* f, char ch) { int i; for(i=1;;i++) { char token[MAX_RTF_TOKEN], arg[MAX_RTF_TOKEN]; token[i-1]=ch; if(i>=MAX_RTF_TOKEN-1) { do { ch = RTF_GetChar( f ); if(feof(f)) return; } while (isalpha(ch)); RTF_BuildArg(f, ch,arg); return; } ch = RTF_GetChar( f ); if(feof(f)) { token[i]=0; RTF_DoControl(f,token,""); return; } if( !isalpha(ch) ) { token[i]=0; RTF_BuildArg(f, ch,arg); RTF_DoControl(f,token,arg); return; } } } /**************************************/ void RTF_backslash(FILE* f, char** pch, char* pf) { int ch; *pf=FALSE; ch = RTF_GetChar( f ); if(feof(f)) { fprintf(stderr,"Unexpected end of file\n"); return; } switch (ch) { case '\\': *pch=charset_table[92]; *pf=TRUE; break; case '{': *pch=charset_table[123]; *pf=TRUE; break; case '}': *pch=charset_table[125]; *pf=TRUE; break; case '*': gobble = TRUE; /*perform no output, ignore commands 'til level-1*/ if(skip_to_level>level-1||skip_to_level==-1) skip_to_level = level-1; break; case '\'': { char ch1, ch2; ch1 = RTF_GetChar( f ); ch2 = RTF_GetChar( f ); if(!feof(f)) { if(isxdigit(ch1)&&isxdigit(ch2)) { ch = chartoi(ch1)*16+chartoi(ch2); *pch = charset_table[ch-1]; *pf=TRUE; } else { fprintf(stderr,"RTF Error: unexpected '%c%c' after \\\'\n",ch1,ch2); } } break; } default: if (isalpha(ch)) { RTF_BuildToken(f, ch); } else { fprintf(stderr, "\nRTF Error: unexpected '%c' after \\.\n", ch); } break; } } /**************************************/ void RTF_ParseStyle(FILE * f) { char ch, pf; char *code; int level0; void (*PrevDoControl)(FILE*,char*,char*); level0=level; PrevDoControl=RTF_DoControl; RTF_DoControl=RTF_DoStyleControl; RTF_ClearName(style_name); style_number[0]=0; while (1) { ch = RTF_GetChar( f ); if(feof(f)) break; switch (ch) { case '\\': RTF_backslash(f, &code, &pf); if(pf) { RTF_BuildName(style_name, code, 1); } else { RTF_ClearName(style_name); } break; case '{': level++; RTF_ClearName(style_name); break; case '}': if(level0+1==level) { if(style_number[0]!=0) { RTF_AddStyleMap(style_name,style_number); style_number[0]=0; } } else if(level0==level) { RTF_DoControl=PrevDoControl; RTF_UnGetChar(f, ch); return; } level--; RTF_ClearName(style_name); break; default: RTF_BuildName(style_name, &ch, 0); break; } } /* while */ } /**************************************/ /* Perform actions for RTF control words */ void RTF_DoBodyControl (FILE * f, char* control,char* arg) { short style; if (gobble) return; switch (GetTokenIndex(control)) { case t_stylesheet: gobble = TRUE; /*perform no output, ignore commands 'til level-1*/ skip_to_level = level-1; RTF_ParseStyle( f ); break; case t_fonttbl: /*skip all of these and their contents!*/ case t_colortbl: case t_info: gobble = TRUE; /*perform no output, ignore commands 'til level-1*/ skip_to_level = level-1; break; case t_pict: gobble = TRUE; /*perform no output, ignore commands 'til level-1*/ if(skip_to_level>=level || skip_to_level==-1) skip_to_level = level-1; break; case t_s: /*Style*/ if (!curr_heading) { style = RTF_MapStyle (arg); if(style) { curr_heading = s_h0 + style; RTF_PutStr(styles[curr_heading][0]); ignore_styles = TRUE; } } break; case t_b: /*Bold*/ RTF_SetStyle(&curr_style,s_bold); break; case t_ulw: case t_uld: case t_uldb: case t_ul: /*Underline, maps to "emphasis" HTML style*/ RTF_SetStyle(&curr_style,s_underline); break; case t_i: /*Italic*/ RTF_SetStyle(&curr_style,s_italic); break; case t_v: /* Hidden*/ RTF_SetStyle(&curr_style,s_hidden); break; case t_par: /*Paragraph*/ if (curr_heading!=s_plain) { RTF_PutStr(styles[curr_heading][1]); curr_heading = s_plain; } else { RTF_PutStr(styles[s_para][0]); } ignore_styles = FALSE; break; case t_plain: /*reset inline styles*/ RTF_PlainStyle(&curr_style); break; case t_cell: case t_tab: RTF_PutChar(' '); break; case t_endash: case t_emdash: RTF_PutChar('-'); break; case t_line: case t_row: RTF_PutStr(styles[s_br][0]); break; case t_bullet: RTF_PutChar('\xb7'); break; case t_start: case t_end: break; case t_rquote: //RTF_PutStr("’"); RTF_PutStr("'"); break; } } /**************************************/ /* RTF_Parse is a crude, ugly state machine that understands enough of */ /* the RTF syntax to be dangerous. */ void RTF_ParseBody( FILE* f ) { char ch, pf; char* code; RTF_DoControl=RTF_DoBodyControl; level = 0; skip_to_level = -1; gobble = FALSE; ignore_styles = FALSE; while (1) { ch = RTF_GetChar( f ); if (feof(f)) { break; } switch (ch) { case '\\': RTF_backslash(f, &code,&pf); if(pf && code) RTF_PutStr(code); break; case '{': RTF_PushState(&level); break; case '}': RTF_PopState(&level); break; default: RTF_PutChar(ch); break; } }/*while*/ } /**************************************/ int RTF_Parse (FILE* f) { RTF_PutStr("\n\n"); isBody=FALSE; RTF_ParseBody(f); if (isBody) RTF_PutStr("\n"); RTF_PutStr("\n"); return 0; } /**************************************/ void Initialize() { int i; for (i=0;i 2 ) { code[i] = '\0'; break; } } for ( i = 0; i < CHARSET_NUMBER ; i++) { if ( strcmp( (const char *)charset[i], (const char *) code ) == 0 ) { charset_table = charset_variablename[i]; break; }; } if ( i == CHARSET_NUMBER ) { charset_table = charset_variablename[CHARSET_DEFAULT]; } return 1; // always true! } /**************************************/ int main(int argc,char** argv) { int rv = 0; FILE *f = NULL; Initialize(); if ( argc > 1) { if( strcmp(argv[1],"--help")==0 || strcmp(argv[1],"-H")==0 ) { printf("Use: %s [rtf_filename]\n",argv[0]); rv = 0; } else if ( strcmp(argv[1],"--version")==0 || strcmp(argv[1],"-V")==0 ) { printf("rtf2html version 1.2\n"); rv = 0; } else { rv = openfile(argv[1], &f); if ( rv ) rv = RTF_FindCharset(f); if ( rv ) { rewind(f); rv = RTF_Parse(f); } if ( rv ) rv = closefile(f); } } else { printf("Use: %s [rtf_filename]\n",argv[0]); } return rv; }