The C Programming Language, 2nd Edition, by Kernighan and Ritchie
Exercise 6.01 on page 136
Our version of getword
does not properly handle underscores, string constants, comments, or preprocessor control lines. Write a better version.
Solution by Ben Pfaff
/* K&R 6-1: "Our version of getword() does not properly handle underscores, string constants, or preprocessor control lines. Write a better version." This is intended to be a solution to K&R 6-1 in "category 0" as defined by the official rules given on Richard Heathfield's "The C Programming Language Answers To Exercises" page, found at http://users.powernet.co.uk/eton/kandr2/index.html. For more information on the language for which this is a lexical analyzer, please see the comment preceding getword() below. Note that there is a small modification to ungetch() as defined by K&R. Hopefully this lies within the rules. */ /* knr61.c - answer to K&R2 exercise 6-1. Copyright (C) 2000 Ben Pfaff <blp@gnu.org>. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #include <ctype.h> #include <limits.h> #include <stdio.h> #include <stdlib.h> #include <string.h> /* Tokens. Other non-whitespace characters self-represent themselves as tokens. */ enum token { TOK_ID = UCHAR_MAX + 1, /* Identifier. */ TOK_STRING, /* String constant. */ TOK_CHAR, /* Character constant. */ TOK_EOF /* End of file. */ }; enum token getword (char *word, int lim); static int skipws (void); static int getstelem (char **, int *, int); static int getch (void); static void ungetch (int); static void putch (char **, int *, int); /* Main program for testing. */ int main (void) { ungetch ('\n'); for (;;) { char word[64]; enum token token; /* Get token. */ token = getword (word, sizeof word); /* Print token type. */ switch (token) { case TOK_ID: printf ("id"); break; case TOK_STRING: printf ("string"); break; case TOK_CHAR: printf ("char"); break; case TOK_EOF: printf ("eof\n"); return 0; default: printf ("other"); word[0] = token; word[1] = '\0'; break; } /* Print token value more or less unambiguously. */ { const char *s; printf ("\t'"); for (s = word; *s != '\0'; s++) if (isprint (*s) && *s != '\'') putchar (*s); else if (*s == '\'') printf ("\\'"); else /* Potentially wrong. */ printf ("\\x%02x", *s); printf ("'\n"); } } } /* Parses C-like tokens from stdin: - Parses C identifiers and string and character constants. - Other characters, such as operators, punctuation, and digits not part of identifiers are considered as tokens in themselves. - Skip comments and preprocessor control lines. Does not handle trigraphs, line continuation with \, or numerous other special C features. Returns a token type. This is either one of TOK_* above, or a single character in the range 0...UCHAR_MAX. If TOK_ID, TOK_STRING, or TOK_CHAR is returned, WORD[] is filled with the identifier or string value, truncated at LIM - 1 characters and terminated with '\0'. For other returned token types, WORD[] is indeterminate. */ enum token getword (char *word, int lim) { int beg_line, c; for (;;) { beg_line = skipws (); c = getch (); if (!beg_line || c != '#') break; /* Skip preprocessor directive. */ do { c = getch (); if (c == EOF) return TOK_EOF; } while (c != '\n'); ungetch ('\n'); } if (c == EOF) return TOK_EOF; else if (c == '_' || isalpha ((unsigned char) c)) { do { putch (&word, &lim, c); c = getch (); } while (isalnum ((unsigned char) c) || c == '_'); ungetch (c); return TOK_ID; } else if (c == '\'' || c == '"') { int quote = c; word[0] = '\0'; while (getstelem (&word, &lim, quote)) ; return quote == '\'' ? TOK_CHAR : TOK_STRING; } else return (unsigned char) c; } /* Skips whitespace and comments read from stdin. Returns nonzero if a newline was encountered, indicating that we're at the beginning of a line. */ static int skipws (void) { /* Classification of an input character. */ enum class { CLS_WS = 0, /* Whitespace. */ CLS_BEG_CMT, /* Slash-star beginning a comment. */ CLS_END_CMT, /* Star-slash ending a comment. */ CLS_OTHER, /* None of the above. */ CLS_IN_CMT = 4 /* Combined with one of the above, indicates we're inside a comment. */ }; /* Either 0, if we're not inside a comment, or CLS_IN_CMT, if we are inside a comment. */ enum class in_comment = 0; /* Have we encountered a newline outside a comment? */ int beg_line = 0; for (;;) { int c; /* Input character. */ enum class class; /* Classification of `c'. */ /* Get an input character and determine its classification. */ c = getch (); switch (c) { case '\n': if (!in_comment) beg_line = 1; /* Fall through. */ case ' ': case '\f': case '\r': case '\t': case '\v': class = CLS_WS; break; case '/': /* Outside a comment, slash-star begins a comment. */ if (!in_comment) { c = getch (); if (c == '*') class = CLS_BEG_CMT; else { ungetch (c); c = '/'; class = CLS_OTHER; } } else class = CLS_OTHER; break; case '*': /* Inside a comment, star-slash ends the comment. */ if (in_comment) { c = getch (); if (c == '/') class = CLS_END_CMT; else { ungetch (c); class = CLS_OTHER; } } else class = CLS_OTHER; break; default: /* Other characters. */ if (c == EOF) return 0; class = CLS_OTHER; } /* Handle character `c' according to its classification and whether we're inside a comment. */ switch (class | in_comment) { case CLS_WS: case CLS_WS | CLS_IN_CMT: case CLS_OTHER | CLS_IN_CMT: break; case CLS_BEG_CMT: in_comment = CLS_IN_CMT; break; case CLS_OTHER: ungetch (c); return beg_line; case CLS_END_CMT | CLS_IN_CMT: in_comment = 0; break; case CLS_BEG_CMT | CLS_IN_CMT: case CLS_END_CMT: default: printf ("can't happen\n"); break; } } } /* Get a character inside a quoted string or character constant. QUOTE is ' for a character constant or " for a quoted string. *WORDP points to a string being constructed that has *LIMP bytes available. */ static int getstelem (char **wordp, int *limp, int quote) { int c; /* Handle end-of-quote and EOF. */ c = getch (); if (c == quote || c == EOF) return 0; /* Handle ordinary string characters. */ if (c != '\\') { putch (wordp, limp, c); return 1; } /* We're in a \ escape sequence. Get the second character. */ c = getch (); if (c == EOF) return 0; /* Handle simple single-character escapes. */ { static const char escapes[] = {"''??\"\"\\\\a\ab\bf\fn\nr\rt\tv\v"}; const char *cp = strchr (escapes, c); if (cp != NULL) { putch (wordp, limp, cp[1]); return 1; } } /* Handle hexadecimal and octal escapes. This also handles invalid escapes by default, doing nothing useful with them. That's okay because invalid escapes generate undefined behavior. */ { unsigned char v = 0; if (c == 'x' || c == 'X') for (;;) { static const char hexits[] = "0123456789abcdef"; const char *p; c = getch (); p = strchr (hexits, tolower ((unsigned char) c)); if (p == NULL) break; v = v * 16 + (p - hexits); } else { int i; for (i = 0; i < 3; i++) { v = v * 8 + (c - '0'); c = getch (); if (c < '0' || c > '7') break; } } putch (wordp, limp, v); ungetch (c); } return 1; } /* Capacity of putback buffer. */ #define BUFSIZE 100 /* Putback buffer. */ char buf[BUFSIZE]; /* Number of characters in putback buffer. */ int bufp = 0; /* Retrieves and returns a character from stdin or from the putback buffer. Returns EOF if end of file is encountered. */ int getch (void) { return bufp > 0 ? buf[--bufp] : getchar (); } /* Stuffs character C into the putback buffer. From the caller's perspective, fails silently if the putback buffer is full. */ void ungetch (int c) { if (c == EOF) return; if (bufp >= BUFSIZE) printf ("ungetch: too many characters\n"); else buf[bufp++] = c; } /* Stuffs character C into buffer *WORDP, which has *LIMP bytes available. Advances *WORDP and reduces *LIMP as appropriate. Drops the character on the floor if it would overflow the buffer. Ensures that *WORDP is null terminated if possible. */ static void putch (char **wordp, int *limp, int c) { if (*limp > 1) { *(*wordp)++ = c; (*limp)--; } if (*limp > 0) **wordp = '\0'; } /* Local variables: compile-command: "checkergcc -W -Wall -ansi -pedantic knr61.c -o knr61" End: */
Solution by codybartfast
This uses a function filter_code
to remove preprocessor statements, comments and double quotes. This enables a filtered
function that can then be used like getchar
by an almost unchanged getword
.
Because getch
and ungetch
are used by both getword
and filter_code
, getch
and ungetch
take a struct called stream
as a parameter that comprises the function providing characters (e.g., getchar
or filtered
), the buffer for ungetch, and the buffer index.
Not many edge case are handled, e.g. only single line preprocessor statements are supported.
/* Exercise 6-1 */ #include <ctype.h> #include <stdio.h> #include <string.h> #include "filter-code.h" #define MAXWORD 100 #define NKEYS (sizeof keytab / sizeof(struct key)) struct key { char *word; int count; } keytab[] = { { "auto", 0 }, { "break", 0 }, { "case", 0 }, { "char", 0 }, { "const", 0 }, { "continue", 0 }, { "default", 0 }, { "do", 0 }, { "double", 0 }, { "else", 0 }, { "enum", 0 }, { "extern", 0 }, { "float", 0 }, { "for", 0 }, { "goto", 0 }, { "if", 0 }, { "int", 0 }, { "long", 0 }, { "register", 0 }, { "return", 0 }, { "short", 0 }, { "signed", 0 }, { "sizeof", 0 }, { "static", 0 }, { "struct", 0 }, { "switch", 0 }, { "typedef", 0 }, { "union", 0 }, { "unsigned", 0 }, { "void", 0 }, { "volatile", 0 }, { "while", 0 } }; int filtered(void); int getword(char *, int); int binsearch(char *, struct key *, int); int asalpha(char c); int asalnum(char c); int nkeys; struct filterstate filterstate; int filterbuff[MAXCHBUF]; struct stream filteredstream = { filtered, filterbuff, 0 }; int main(void) { int n; char word[MAXWORD]; nkeys = (sizeof keytab / sizeof(struct key)); while (getword(word, MAXWORD) != EOF) if (isalpha(word[0])) if ((n = binsearch(word, keytab, nkeys)) >= 0) keytab[n].count++; for (n = 0; n < nkeys; n++) if (keytab[n].count > 0) printf("%4d %s\n", keytab[n].count, keytab[n].word); return 0; } /* like getch, but preproc, comments and strings are removed */ int filtered(void) { return filter_code(streamin, filterstate); } int binsearch(char *word, struct key tab[], int n) { int cond; int low, high, mid; low = 0; high = n - 1; while (low <= high) { mid = (low + high) / 2; if ((cond = strcmp(word, tab[mid].word)) < 0) high = mid - 1; else if (cond > 0) low = mid + 1; else return mid; } return -1; } int getword(char *word, int lim) { int c; char *w = word; while (isspace(c = getch(filteredstream))) ; if (c != EOF) *w++ = c; if (!asalpha(c)) { *w = '\0'; return c; } for (; --lim > 0; w++) if (!asalnum(*w = getch(filteredstream))) { ungetch(filteredstream, *w); break; } *w = '\0'; return word[0]; } /* is character we treat As alphabetic */ int asalpha(char c){ return isalpha(c) || c == '_'; } /* is character we treat As alphabetic or is numeric */ int asalnum(char c){ return isalnum(c) || c == '_'; }
filter-code.h
#include "stream.h" struct filterstate { int mode; }; struct filterstate newfilterstate(void); int filter_code(struct stream stream, struct filterstate state);
filter-code.h
#include <stdio.h> #include "filter-code.h" enum filtermode { CODE = 0, PREPROC, COMMENT, DOUBLE}; struct filterstate newfilterstate(void) { struct filterstate s = { CODE }; return s; } int filter_code(struct stream stream, struct filterstate state) { int c = getch(stream); switch (state.mode) { case CODE: switch (c) { case '#': state.mode = PREPROC; return filter_code(stream, state); case '/': if ((c = getch(stream)) == '*') { state.mode = COMMENT; return filter_code(stream, state); } else { ungetch(stream, c); return '/'; } case '"': state.mode = DOUBLE; return filter_code(stream, state); default: return c; } case PREPROC: /* Assumes preproc statements are single line */ switch (c) { case '\n': state.mode = CODE; return filter_code(stream, state); default: return filter_code(stream, state); } case COMMENT: switch (c) { case '*': if ((c = getch(stream)) == '/') { state.mode = CODE; return filter_code(stream, state); } else { ungetch(stream, c); return filter_code(stream, state); } default: return filter_code(stream, state); } case DOUBLE: switch (c) { case '\\': if (getch(stream) == EOF) return EOF; /* ignore escaped char */ return filter_code(stream, state); case '"': state.mode = CODE; return filter_code(stream, state); default: return filter_code(stream, state); } default: printf("error: Unexpected mode %d", state.mode); return EOF; } }
stream.h
#define MAXCHBUF (1 << 1) struct stream streamin; struct stream { int (*getcharacter)(void); int *chbuf; int chbufidx; }; int getch(struct stream stream); void ungetch(struct stream stream, int c);
stream.c
#include <stdio.h> #include "stream.h" static int inbuff[MAXCHBUF]; struct stream streamin = { getchar, inbuff, 0 }; int getch(struct stream stream) { return (stream.chbufidx > 0) ? stream.chbuf[--stream.chbufidx] : stream.getcharacter(); } void ungetch(struct stream stream, int c) { if (stream.chbufidx >= MAXCHBUF) printf("ungetch: too many characters\n"); else { stream.chbuf[stream.chbufidx++] = c; } }
Also on github
Solution by anonymous
I found the C17 standard's full list of keywords and used them in my program. I used a status system that keeps track of what is being read (normal text, string, comment, or preprocessor directive). I then added a lot of if statements to change the status to handle these different type of code. It doesn't handle complex comments or multiline statements using the backslash.
I tried out Ben Pfaff's program, but it seemed to crash since one of the last lines was 'ungetch: too many characters\x0a'. That was even after I increased the buffer size substantially. I also tried out codybartfast program as well, but it crashed when using my test code (there is a bug when there is a double quote char ('"') in the input)
Here is my code
#include <stdio.h> #include <ctype.h> #include <string.h> /* Exercise 6-1. Our version of getword does not properly handle underscores, string constants, comments, or preprocessor control lines. Write a better version. */ struct key { char *word; int count; } keytab[] = { // Current keywords as of C17 standard (ISO/IEC 9899:2017). These must be in alphabetical order { "_Alignas", 0 }, { "_Alignof", 0 }, { "_Atomic", 0 }, { "_Bool", 0 }, { "_Complex", 0 }, { "_Generic", 0 }, { "_Imaginary", 0 }, { "_Noreturn", 0 }, { "_Static_assert", 0 }, { "_Thread_local", 0 }, { "auto", 0 }, { "break", 0 }, { "case", 0 }, { "char", 0 }, { "const", 0 }, { "continue", 0 }, { "default", 0 }, { "do", 0 }, { "double", 0 }, { "else", 0 }, { "enum", 0 }, { "extern", 0 }, { "float", 0 }, { "for", 0 }, { "goto", 0 }, { "if", 0 }, { "inline", 0 }, { "int", 0 }, { "long", 0 }, { "register", 0 }, { "restrict", 0 }, { "return", 0 }, { "short", 0 }, { "signed", 0 }, { "sizeof", 0 }, { "static", 0 }, { "struct", 0 }, { "switch", 0 }, { "typedef", 0 }, { "union", 0 }, { "unsigned", 0 }, { "void", 0 }, { "volatile", 0 }, { "while", 0 } }; #define MAXWORD 100 #define NKEYS (int)(sizeof keytab / sizeof keytab[0]) #define BUFSIZE 100 int getword(char *, int); int binsearch(char *, struct key *, int); int getch(void); void ungetch(int); enum statusTypes { NORMAL, STRING, SINGLE_LINE_COMMENT, MULTI_LINE_COMMENT, PREPROCESSOR }; char prevChar; // keep track of last char int type; // keep track of the current status char buf[BUFSIZE]; // buffer for ungetch int bufp = 0; // next free position in buf // count C keywords int main() { int n; char word[MAXWORD]; while (getword(word, MAXWORD) != EOF) { if (word[0] == '"') { if (type == NORMAL && prevChar != '\'') type = STRING; else if (type == STRING) type = NORMAL; } else if (word[0] == '/') { if (prevChar == '/' && type == NORMAL) type = SINGLE_LINE_COMMENT; else if (type == MULTI_LINE_COMMENT && prevChar == '*') type = NORMAL; } else if (word[0] == '\n' && (type == SINGLE_LINE_COMMENT || type == PREPROCESSOR)) type = NORMAL; else if (word[0] == '*' && prevChar == '/' && type == NORMAL) type = MULTI_LINE_COMMENT; else if (word[0] == '#' && prevChar == '\n') type = PREPROCESSOR; if (type == NORMAL && (isalpha(word[0]) || word[0] == '_')) if ((n = binsearch(word, keytab, NKEYS)) >= 0) keytab[n].count++; if (strlen(word) == 1) prevChar = word[0]; } for (n = 0; n < NKEYS; n++) if (keytab[n].count > 0) printf("%4d %s\n", keytab[n].count, keytab[n].word); return 0; } // find word in tab[0] ... tab[n-1], tab[] must be in alphabetical order int binsearch(char *word, struct key tab[], int n) { int result, mid, low = 0, high = n - 1; while (low <= high) { mid = (low + high) / 2; if ((result = strcmp(word, tab[mid].word)) < 0) high = mid - 1; else if (result > 0) low = mid + 1; else return mid; } return -1; } // get next word or character from input int getword(char *word, int lim) { int c; char *w = word; while ((c = getch()) == '\t' || c == ' ') ; if (c != EOF) *w++ = c; if (!isalpha(c) && c != '_') { *w = '\0'; return c; } for ( ; --lim > 0; w++) if (!isalnum(*w = getch()) && *w != '_') { ungetch(*w); break; } *w = '\0'; return word[0]; } // get a (possibly pushed back) character int getch(void) { return (bufp > 0) ? buf[--bufp] : getchar(); } // push character back on input void ungetch(int c) { if (bufp >= BUFSIZE) printf("ungetch: too many characters\n"); else buf[bufp++] = c; }