This is supposed to take program sources and convert them to something that Netscape will show nicely. Comments, character strings, and pre-processor directives are shown using the emphasis font, and keywords using the strong font.
You can, if you wish, bracket selected areas of the source between lines containing <doc> and </doc>. Such lines are assumed to contain reasonable HTML text, and will be output "as-is". As a bonus, the characters before the <doc> are remembered and will be removed from the bracketed lines, which means you can even use "here to end-of-line" style comments for your documentation. (The bracket lines themselves, by the way, are suppressed.)
The other pair of bracket lines recognised is <hide>...</hide>, which causes the marked text to be quietly dropped from sight.
#include <assert.h> #include <ctype.h> #if defined (__MSDOS__) #include <io.h> #endif #include <limits.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #if defined (__unix) #include <unistd.h> #endif
First come the basic output routines for program text. The first, char_out, deals with the problems of converting HTML meta-characters into entity references. The second, str_out, just calls char_out in a loop. The last two, stag_out and etag_out, emit the opening and closing tags used for changing typeface as and when necessary.
/* When non-zero, every routine in this group produces no output at all. */
static int quiet;

/*
 * Write one character of program text to stdout.
 *
 * The three HTML meta-characters are replaced by their entity references
 * so that a browser displays them literally instead of treating them as
 * markup; every other character is written unchanged.
 */
static void char_out (int ch)
{
    if (quiet)
        return;
    switch (ch) {
    case '<':
        fputs ("&lt;", stdout);
        break;
    case '>':
        fputs ("&gt;", stdout);
        break;
    case '&':
        fputs ("&amp;", stdout);
        break;
    default:
        putchar (ch);
        break;
    }
}

/* Write a NUL-terminated string of program text, one char_out() at a time. */
static void str_out (char *s)
{
    while (*s)
        char_out (*s++);
}

/*
 * Emit an opening tag.  The whole of s is written between the angle
 * brackets, so s may carry attributes as well as the element name.
 */
static void stag_out (char *s)
{
    if (quiet)
        return;
    putchar ('<');
    fputs (s, stdout);
    putchar ('>');
}

/*
 * Emit the closing tag that matches stag_out (s).  Only the leading run of
 * characters greater than ' ' is copied, so anything after the first blank
 * or control character in s is left out of the closing tag.
 */
static void etag_out (char *s)
{
    int ch;

    if (quiet)
        return;
    putchar ('<');
    putchar ('/');
    while ((ch = *s++) > ' ')
        putchar (ch);
    putchar ('>');
}
Now a handful of routines to help classify the possible characters.
/* Character classes.  They are bit flags, so one character may carry several. */
enum {
    id_char     = 0x01,     /* char occurs in identifiers */
    space_char  = 0x02,     /* char is some type of white space */
    punct_char  = 0x04,     /* operators, brackets, and the like */
    quote_char  = 0x08,     /* start or end strings */
    escape_char = 0x10,     /* "hide" a quote inside a string */
    nul_char    = 0x20      /* char is NUL */
};

/* Class flags for every possible unsigned char value. */
static char ch_type [UCHAR_MAX + 1];

/*
 * Add the flag(s) t to every character listed in the string p.
 * Blanks and control characters (anything not above ' ') are ignored.
 */
static void init_from_string (char *p, int t)
{
    const unsigned char *scan;

    for (scan = (const unsigned char *) p; *scan != 0; scan++) {
        if (*scan > ' ')
            ch_type [*scan] |= t;
    }
}

/* Return the class flags of ch; ch is reduced to unsigned char first. */
static int type_of (int ch)
{
    unsigned char index = (unsigned char) ch;

    return ch_type [index];
}

/*
 * Set up the classes that do not depend on the language: NUL, white space
 * (codes 1 up to ' ' plus 127), and the default identifier characters.
 */
static void init_types (void)
{
    int code = 1;

    ch_type [0] = nul_char;
    while (code <= ' ')
        ch_type [code++] = space_char;
    ch_type [127] = space_char;

    /* We can usually assume letters and digits are allowed in an identifier */
    init_from_string ("abcdefghijklmnopqrstuvwxyz"
                      "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
                      "0123456789", id_char);
}

/*
 * Mark as punctuation every character that has no class yet.  A character
 * whose only flag is escape_char counts as unclassified here, so it ends
 * up with both escape_char and punct_char set.
 */
static void set_punct (void)
{
    int code;

    for (code = 0; code <= UCHAR_MAX; code++) {
        if (ch_type [code] & ~escape_char)
            continue;               /* already has a real class */
        ch_type [code] |= punct_char;
    }
}
A wrapper around malloc() or realloc() calls to save a lot of testing later.
/*
 * Pass a freshly allocated pointer through unchanged, or give up.
 * A null p is taken to mean the allocation failed: a message is written
 * to stderr and the program exits with status 1.
 */
static void *ensure (void *p)
{
    if (p != NULL)
        return p;

    fputs ("Out of memory\n", stderr);
    exit (1);
}
A bit of string cleaning:
/*
 * Strip leading and trailing blanks and control characters from s.
 *
 * Leading ones are skipped by advancing the pointer; trailing ones are
 * overwritten with NUL, so the buffer is modified in place.  Returns a
 * pointer into the same buffer, or NULL when s is NULL.
 *
 * Bytes are compared as unsigned char: where plain char is signed, bytes
 * of 0x80 and above are negative and would otherwise be mistaken for
 * white space and trimmed away.
 */
static char *trim (char *s)
{
    if (s) {
        size_t len;

        /* ltrim */
        while (*s != '\0' && (unsigned char) *s <= ' ')
            s += 1;

        /* rtrim */
        len = strlen (s);
        while (len && (unsigned char) s [len - 1] <= ' ')
            s [--len] = '\0';
    }
    return (s);
}
Now for the lowest level input routines: get_line and push_back. The get_line routine simply calls fgets a few times until it either has a complete line or it exceeds some random limit (currently 10K). It reads into a malloced buffer, which it frees next time around. You can, of course, push_back a tail of this buffer which will look like the next line to be retrieved. This greatly simplifies a lot of the code later.
static char *pushed_back;   /* text to hand out on the next get_line() call */
static FILE *in_file;       /* the stream get_line() reads from */
static char *kw_name;       /* name of the initialisation file, for messages */

/* Arrange for p to be returned by the next get_line().  Only one deep. */
static void push_back (char *p)
{
    assert (! pushed_back);
    pushed_back = p;
}

/*
 * Return the next line of input with its newline removed, or NULL at end
 * of file.
 *
 * Pushed-back text, if any, is returned first.  Otherwise the line is read
 * into a heap buffer that grows 1024 bytes at a time; once about 10K has
 * been collected the loop stops and the rest of the line is left to be
 * read by the next call.  The buffer belongs to this routine and is
 * released on the next call that actually reads.
 */
static char *get_line (void)
{
    static char *line_buf;
    char *pending = pushed_back;
    size_t used = 0;

    pushed_back = NULL;
    if (pending != NULL)
        return pending;

    free (line_buf);
    line_buf = NULL;

    while (used < 10 * 1024) {
        size_t had = used;      /* characters gathered by earlier passes */
        size_t got;
        char *chunk;

        used += 1024;
        line_buf = ensure (realloc (line_buf, used));
        chunk = line_buf + had;

        if (fgets (chunk, used - had, in_file) == NULL)
            return had ? line_buf : NULL;

        got = strlen (chunk);
        if (got == 0)
            break;
        if (chunk [got - 1] == '\n') {
            chunk [got - 1] = '\0';
            break;
        }
        /* No newline yet: carry on filling from where this chunk stopped. */
        used = had + got;
    }
    return line_buf;
}
The next_token routine assumes that t_end points to the first character worth considering for fetching a new token. This is usually where the previous token ended, hence the name, but can also be the first character of a new line.
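The steps are: (1) skip over, and echo to the output, any leading white space; (2) if that brings us to the end of the line, say so and stop; (3) if the token starts with a quote character, scan forward to the matching quote (or to the end of the line), stepping over any character that follows an escape character; (4) otherwise gather up the run of following characters that belong to the same class (identifier or punctuation) as the first one.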
/*
 * t_start and t_end delimit the current token (t_end is one past its last
 * character).  prefix holds the text that preceded the most recent
 * start-of-documentation marker.
 */
static char *t_start, *t_end, *prefix;

/*
 * Find the next token at or after t_end and return the class of its first
 * character, or nul_char when the line is exhausted (t_end is then left
 * where it was).  White space in front of the token is echoed via
 * char_out().
 */
static int next_token (void)
{
    int kind;

    /* Echo and step over leading white space. */
    for (t_start = t_end; (kind = type_of (*t_start)) == space_char; t_start++)
        char_out (*t_start);

    if (kind == nul_char)
        return nul_char;

    t_end = t_start;

    if (kind != quote_char) {
        /* Take the first character, then any run sharing its id/punct class. */
        int mask = kind & (id_char | punct_char);

        do
            t_end++;
        while (type_of (*t_end) & mask);
        return kind;
    }

    /* Quoted string: scan to the matching quote or to the end of the line. */
    {
        int closer = *t_start;

        for (;;) {
            int ch = *++t_end;

            if (ch == '\0')
                break;
            if (ch == closer) {
                t_end++;            /* the token includes the closing quote */
                break;
            }
            /* An escape also swallows the character after it, if there is one. */
            if ((type_of (ch) & escape_char) && t_end [1])
                t_end++;
        }
    }
    return kind;
}
Before an identifier is output it's checked to see if it's a keyword, so that the type style can be changed if necessary. The keywords are read from an initialisation file and stored in a traditional linked list.
If the user requires that variable spelling be standardised, I also keep a list of variables that have been seen. Hopefully, this will not be a common occurrence, so I feel justified in "cheating" here.
/* One entry in a singly linked list of remembered words. */
typedef struct keyword keyword;

struct keyword {
    char *spelling;         /* the text of the word */
    size_t len;             /* how many characters of spelling are significant */
    keyword *next;          /* next entry, or NULL at the end of the list */
};

static keyword *kw_base;    /* head of the keyword list */
static keyword *var_base;   /* head of the list of variables seen so far */

/* Signature shared by the length-limited string comparison routines. */
typedef int (*compare_fn) (const char *a, const char *b, size_t len);

/* The comparison used for word lookup; starts out case-sensitive. */
static compare_fn kwcomp = strncmp;
A non-ANSI string compare function that comes in handy
#if defined (__MSDOS__) static compare_fn nocase_cmp = (strnicmp); #elif defined (__unix) static int nocase_cmp (const char *a, const char *b, size_t len) { int res = 0; while (res == 0 && (*a || *b) && len--) res = *a++ - *b++; return (res); } #endif static char *find_keyword (void) { keyword *p = kw_base; size_t len = (size_t) (t_end - t_start); while (p) { if (len == p -> len && kwcomp (p -> spelling, t_start, len) == 0) return (p -> spelling); p = p -> next; } return (NULL); } static char *add_keyword (void) { char *spelling = find_keyword (); if (! spelling) { size_t len = (size_t) (t_end - t_start); keyword *p = ensure (malloc (sizeof (keyword))); spelling = ensure (malloc (len)); p -> next = kw_base; p -> len = len; memcpy (p -> spelling = spelling, t_start, len); kw_base = p; } return (spelling); }
The keywords section of the initialisation file is simply a list of identifiers. These are read (by calling next_token) and stored away in the list, until we get either EOF or something starting '[' (which we'll take as meaning "end-of-section"). Comments start, as usual, with a semicolon. Unusually, this must be in the first column to allow for the odd cases where you really need a semicolon in your keywords.
/*
 * Read the keyword section of the initialisation file.  Every identifier
 * token found is added to the keyword list.  Reading stops at end of file
 * or at the first token starting with '['.  A line with ';' in its first
 * column is skipped entirely.
 */
static void read_keywords (void)
{
    for (;;) {
        int kind;

        t_end = get_line ();
        if (t_end == NULL)
            return;
        if (*t_end == ';')
            continue;               /* comment line */

        while ((kind = next_token ()) != nul_char) {
            if (*t_start == '[')
                return;             /* start of the next section */
            if (kind == id_char)
                add_keyword ();
        }
    }
}
At various points in what follows we'll be changing the "typeface" between normal, bold (actually <strong>), and italic (<em>). In fact, whenever we want bold we'll call set_face(bold), no matter what was selected before. It's up to set_face() to remember what we've got, how to turn it off, and whether to suppress redundant switches.
/* The typefaces set_face() can switch between. */
enum { normal, italics, bold };

/* Tag text used for keywords, for comments/strings, and around the code. */
static char *strong, *em, *pre;

/*
 * Switch the output typeface to new_face.  Nothing is written when that
 * face is already selected; otherwise the tag for the old face is closed
 * (if it had one) and the tag for the new face is opened (if it has one).
 */
static void set_face (int new_face)
{
    static int old_face;

    if (old_face == new_face)
        return;

    /* Close whatever is open at the moment. */
    if (old_face == italics)
        etag_out (em);
    else if (old_face == bold)
        etag_out (strong);

    old_face = new_face;

    /* Open the replacement. */
    if (new_face == italics)
        stag_out (em);
    else if (new_face == bold)
        stag_out (strong);
}
Now come the routines that do the output, depending on what mode we happen to find ourselves in. At the start, we're not_decided, since we don't know if the first non-hidden line will be code or documentation.
/*
 * What kind of line is being processed.  mode_now is the current mode;
 * prev_mode remembers the mode to go back to once hiding ends.
 */
static enum {
    not_decided = 0,    /* nothing but hidden or blank lines seen so far */
    hiding,             /* inside a hidden region */
    doing_code,         /* ordinary program text */
    doing_doc           /* inside a documentation region */
} mode_now, prev_mode;

/* Non-zero while inside a block comment; counts levels for nestable ones. */
static int comment_depth;
The two check_... routines are called to make sure the line we're working on isn't one of the magic marker lines. If it is magic, we do a mode-switch and drop the line.
static int check_hidden (void) { if (strstr (t_start, "<" "hide" ">")) { prev_mode = mode_now; mode_now = hiding; return (1); } else return (0); } static int hide_code; static int check_doc (void) { char *end_here = strstr (t_start, "<" "doc" ">"); if (end_here) { size_t pre_len = (size_t) (end_here - t_start); if (prefix) free (prefix); prefix = ensure (malloc (1 + pre_len)); if (pre_len) memcpy (prefix, t_start, pre_len); prefix [pre_len] = '\0'; set_face (normal); if (mode_now == doing_code && ! hide_code) etag_out (pre); mode_now = doing_doc; return (1); } else return (0); }
Comments are particularly nasty. Some languages allow them to nest, others don't. Some comments are delimited at both ends, some just extend to the end-of-line marker. And some languages have multiple types of comment marker. Looks like another job for linked lists.
/* One style of comment understood by the current language. */
typedef struct comment {
    char *start;            /* text that opens the comment */
    char *end;              /* text that closes it; NULL means "to end of line" */
    int nestable;           /* non-zero when such comments may nest */
    struct comment *next;   /* next style in the list */
} comment;

/* All known comment styles, and the style of the comment we are inside. */
static comment *comment_base, *this_comment;

/*
 * Output the part of a block comment found on the current line.
 *
 * Text is emitted up to and including the first closing marker, or, for a
 * nestable style, up to and including a nested opening marker when that
 * finishes first.  comment_depth is adjusted to match and the rest of the
 * line is pushed back for another pass.  With no marker on the line, the
 * whole line is emitted.
 */
static void do_comments (void)
{
    char *closer = this_comment -> end;
    char *stop = strstr (t_start, closer);

    if (stop != NULL)
        stop += strlen (closer);

    if (this_comment -> nestable) {
        char *opener = this_comment -> start;
        char *inner = strstr (t_start, opener);

        if (inner != NULL) {
            inner += strlen (opener);
            if (stop == NULL || stop > inner) {
                /* Net effect is +1: one is taken off again below. */
                comment_depth += 2;
                stop = inner;
            }
        }
    }

    if (stop == NULL) {
        /* The comment runs on past the end of this line. */
        str_out (t_start);
        putchar ('\n');
        return;
    }

    while (t_start < stop)
        char_out (*t_start++);
    comment_depth -= 1;
    push_back (t_start);
}
Here's where we check for a start-of-comment. Check the sequence of characters we're just about to output, hoping that it matches one of the "open comment" patterns. If so, emit the line (in italics) if it's a "rest-of-line" style comment, or push back the comment text and start a "block comment". If it's not a comment, return 0 and it'll get output as code.
static int start_comment (void) { comment *p; for (p = comment_base; p; p = p -> next) { size_t len = strlen (p -> start); if (t_end - t_start >= len && memcmp (t_start, p -> start, len) == 0) { set_face (italics); str_out (p -> start); t_start += len; if (p -> end) { comment_depth = 1; this_comment = p; push_back (t_start); } else { str_out (t_start); putchar ('\n'); } return (1); } } return (0); }
The add_comment() routine puts things into the list, according to what gets found in the initialisation file.
/*
 * Record a comment style at the head of the list.
 *
 *   start  text that opens the comment (copied)
 *   end    text that closes it (copied), or NULL for a rest-of-line comment
 *   nest   non-zero when comments of this style may nest
 */
static void add_comment (char *start, char *end, int nest)
{
    comment *node = ensure (malloc (sizeof (comment)));

    node -> start = ensure (malloc (strlen (start) + 1));
    strcpy (node -> start, start);

    node -> end = 0;
    if (end) {
        node -> end = ensure (malloc (strlen (end) + 1));
        strcpy (node -> end, end);
    }

    node -> nestable = nest;
    node -> next = comment_base;
    comment_base = node;
}
Output the identifier in the preferred style of spelling: upper-case, lower-case, standardised, or as found in the source file.
static void correct_spelling (int face, char *std, int type) { set_face (face); while (t_start < t_end) { switch (type) { case 'u': putchar (toupper (*t_start)); break; case 'l': putchar (tolower (*t_start)); break; case 's': putchar (*std++); break; default: putchar (*t_start); break; } t_start += 1; } } static int kw_fold, var_fold; static void print_id (void) { char *p = find_keyword (); if (p) correct_spelling (bold, p, kw_fold); else { if (var_fold == 's') {
If we've been asked to standardise variable spellings we've got to keep a list of what we've seen so far. This is going to make things run very slowly if we've got millions of variables defined.
keyword *keep_kw_base = kw_base; kw_base = var_base; p = add_keyword (); var_base = kw_base; kw_base = keep_kw_base; } else p = t_start; correct_spelling (normal, p, var_fold); } }
The processing for a line of code: emit the tokens one by one until told otherwise.
static void do_code_line (void) { int this_type; if (check_hidden () || check_doc () || hide_code) return; if (comment_depth) { do_comments (); return; } while ((this_type = next_token ()) != nul_char) { switch (this_type) { case id_char: print_id (); break; case quote_char: set_face (italics); while (t_start < t_end) char_out (*t_start++); break; case punct_char: if (start_comment ()) return; set_face (bold); char_out (*t_start++); t_end = t_start; break; } } char_out ('\n'); }
The processing for a line of documentation: if we're not ending the doc just print the line, remembering to remove anything it might have in common with the "start doc" line.
/*
 * Process one line of documentation.
 *
 * The end-of-documentation marker switches back to code mode (reopening
 * the code tag, and italics when a block comment is still open) and the
 * marker line is dropped.  Any other line is written out as it stands,
 * minus whatever leading characters it shares with the saved prefix.
 */
static void do_doc_line (void)
{
    if (check_hidden ())
        return;

    if (strstr (t_start, "<" "/doc" ">") != NULL) {
        if (! hide_code)
            stag_out (pre);
        mode_now = doing_code;
        if (comment_depth)
            set_face (italics);
        return;
    }

    if (prefix != NULL) {
        const char *lead;

        /* Step over the part of the line that matches the prefix. */
        for (lead = prefix; *lead != '\0' && *lead == *t_start; lead++)
            t_start++;
    }

    puts (t_start);
}
And, of course, hiding a line is simplest of all.
/*
 * Process one hidden line: output nothing, and go back to the remembered
 * mode once the end-of-hiding marker turns up.
 */
static void do_hiding_biz (void)
{
    if (strstr (t_start, "<" "/hide" ">") == NULL)
        return;

    mode_now = prev_mode;
}
This is the handler for "don't know" mode. If the line isn't the start of documentation, check to see if there's a token on it. If there is, push back the whole line and read it again as code.
/*
 * Handle a line while the mode is still undecided.
 *
 * A marker line switches mode as usual.  Otherwise, if the line has any
 * token on it, the whole line is pushed back to be read again as code and
 * the code tag is opened.  A line with no token is dropped.  Output is
 * switched on (quiet cleared) as soon as the mode becomes documentation
 * or code.
 */
static void decide_then (void)
{
    char *whole_line = t_start;

    if (check_hidden ())
        return;

    if (check_doc ()) {
        quiet = 0;
        return;
    }

    if (next_token () == nul_char)
        return;                     /* nothing on this line */

    quiet = 0;
    push_back (whole_line);
    mode_now = doing_code;
    if (! hide_code)
        stag_out (pre);
}
Are two strings "the same", assuming blanks and case are not significant?
/*
 * Return 1 when a and b are the same once blanks (and control characters)
 * are ignored and letter case is disregarded, otherwise 0.
 *
 * Bytes are examined as unsigned char.  Where plain char is signed, bytes
 * of 0x80 and above are negative; they would otherwise be skipped as if
 * they were blanks, and passing them to tolower() is undefined.
 */
static int matching_strings (char *a, char *b)
{
    const unsigned char *p = (const unsigned char *) a;
    const unsigned char *q = (const unsigned char *) b;

    for (;;) {
        while (*p && *p <= ' ')
            p += 1;
        while (*q && *q <= ' ')
            q += 1;
        if (tolower (*p) != tolower (*q))
            return (0);
        if (! *p)
            return (1);             /* both strings ended together */
        p += 1;
        q += 1;
    }
}
Is one string a prefix of the other, in the style of matching_strings() above?
/*
 * Return 1 when b is a prefix of a, ignoring blanks (and control
 * characters) and letter case, otherwise 0.  An empty b is a prefix of
 * everything.
 *
 * Bytes are examined as unsigned char.  Where plain char is signed, bytes
 * of 0x80 and above are negative; they would otherwise be skipped as if
 * they were blanks, and passing them to tolower() is undefined.
 */
static int is_prefix (char *a, char *b)
{
    const unsigned char *p = (const unsigned char *) a;
    const unsigned char *q = (const unsigned char *) b;

    for (;;) {
        while (*p && *p <= ' ')
            p += 1;
        while (*q && *q <= ' ')
            q += 1;
        if (! *q)
            return (1);             /* all of b has been matched */
        if (tolower (*p) != tolower (*q))
            return (0);
        p += 1;
        q += 1;
    }
}
The initialisation file contains sections delimited by headings wrapped in [...] characters. For each language I'll be interested in the [<lang> Characters] and [<lang> Keywords] sections. This is where we do the looking.
/*
 * Position in_file just after the heading line that matches section_name
 * (compared with matching_strings(), so blanks and case do not matter).
 * The search always starts from the top of the file.  If no such line
 * exists a message is written to stderr and the program exits with
 * status 1.
 */
static void find_section (char *section_name)
{
    char *line;

    fseek (in_file, 0, SEEK_SET);
    for (line = get_line (); line != NULL; line = get_line ()) {
        if (matching_strings (line, section_name))
            return;
    }

    fprintf (stderr, "Can't find %s section in %s\n", section_name, kw_name);
    exit (1);
}
Here's a horrible routine.
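Assuming we're positioned at the right section, read a collection of lines of the form param=val and, if the param is recognised, process the val in some way. The params understood so far are: quote (or quotes), letter (or letters), and escape (or escapes), each of which adds the listed characters to the corresponding character class; comment (or comments), which gives a start-of-comment marker, optionally followed by an end-of-comment marker and then by a further word to say that such comments may nest; and keyword case and variable case, whose value starts with u (upper case), l (lower case), or s (standardised), anything else meaning "leave the spelling as found".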
Setting any value for variable case or keyword case indicates that the language is not case sensitive.
static void read_setup (void) { char *line; char *equals; while ((line = trim (get_line ())) != NULL) { if (*line == '[') return; equals = strchr (line, '='); if (equals) { static struct kwvals { char *kw; int val; } params [] = { {"quote=", quote_char}, {"quotes=", quote_char}, {"letter=", id_char}, {"letters=", id_char}, {"escape=", escape_char}, {"escapes=", escape_char}, {NULL, 0} }; struct kwvals *p; equals = trim (equals + 1); if (is_prefix (line, "keyword case=")) { switch (kw_fold = tolower (*equals)) { case 's': case 'u': case 'l': break; default: kw_fold = '-'; break; } } else if (is_prefix (line, "variable case=")) { switch (var_fold = tolower (*equals)) { case 's': case 'u': case 'l': break; default: var_fold = '-'; break; } } else if (is_prefix (line, "comment=") || is_prefix (line, "comments=")) { char *end = equals + strlen (equals); char *close_with = equals; while (*close_with > ' ') close_with += 1; if (*close_with == '\0') add_comment (equals, NULL, 0); else { *close_with++ = '\0'; close_with = trim (close_with); end = close_with; while (*end > ' ') end += 1; *end++ = '\0'; add_comment (equals, close_with, *end); } } else for (p = params; p -> kw; p += 1) if (is_prefix (line, p -> kw)) { init_from_string (equals, p -> val); break; } } } }
A marginally less horrible routine. Here we load the [General] section, which defines the HTML tags to use.
/*
 * Read "param=val" lines from the current position in the initialisation
 * file until end of file or a line starting with '['.  The params
 * "keyword tag", "comment tag" and "code tag" replace strong, em and pre
 * respectively with a heap copy of val.  Anything else is ignored.
 */
static void read_general (void)
{
    static struct kwvals {
        char *kw;
        char **val;
    } params [] = {
        {"keyword tag=", &strong},
        {"comment tag=", &em},
        {"code tag=", &pre},
        {NULL, NULL}
    };
    char *line;

    while ((line = trim (get_line ())) != NULL) {
        struct kwvals *entry;
        char *value;

        if (*line == '[')
            return;

        value = strchr (line, '=');
        if (value == NULL)
            continue;
        value = trim (value + 1);

        /* Find the first table entry whose name starts the line. */
        entry = params;
        while (entry -> kw != NULL && ! is_prefix (line, entry -> kw))
            entry++;

        if (entry -> kw != NULL) {
            char **slot = entry -> val;

            free (*slot);           /* drop any earlier setting */
            *slot = ensure (malloc (1 + strlen (value)));
            strcpy (*slot, value);
        }
    }
}
This is where we make the decision as to which language we're using and whether we're suppressing all but the documentation text.
static int init_language (int argc, char **argv) { if (argc > 1 && strcmp (argv [1], "-d") == 0) { argc -= 1; argv += 1; hide_code = 1; } if (argc != 2) return (0); if ((in_file = fopen ((kw_name = argv [1]), "r")) == NULL) { fprintf (stderr, "Can't open '%s'\n", kw_name); exit (1); } find_section ("[General]"); read_general (); if (! em) em = "em"; if (! pre) pre = "pre"; if (! strong) strong = "strong"; find_section ("[Characters]"); read_setup (); if (kw_fold || var_fold) kwcomp = nocase_cmp; find_section ("[Keywords]"); read_keywords (); fclose (in_file); return (1); }
Given the above, the main routine is quite simple. Do a bit of set-up, initialise according to language, then read stdin and process each line accordingly.
int main (int argc, char **argv) { quiet = 1; init_types (); if (isatty (fileno (stdin)) || ! init_language (argc, argv)) { fprintf (stderr, "Usage: %s [-d] kw-file <source >dest.html\n", argv [0]); exit (1); } set_punct (); in_file = stdin; while ((t_start = t_end = get_line ()) != NULL) { switch (mode_now) { case doing_code: do_code_line (); break; case doing_doc: do_doc_line (); break; case hiding: do_hiding_biz (); break; case not_decided: decide_then (); break; } } if (mode_now == hiding) mode_now = prev_mode; switch (mode_now) { case not_decided: fprintf (stderr, "Nothing useful done\n"); return (1); case doing_code: set_face (normal); if (! hide_code) { etag_out (pre); putchar ('\n'); } default: break; } return (0); }