Jump to: navigation, search

The C Programming Language, 2nd Edition, by Kernighan and Ritchie
Exercise 5.18 on page 126

Make dcl recover from input errors.



Solution by Robert Taylor

Main dcl file:

#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include "getch.h"

#define MAXTOKEN 100
/* Author: Robert Taylor
 * Creation Date: June, 2014
 * Exercise 5-18
 * "Make dcl recover from input errors"
 * If you are doing this exercise for a course, you probably would
 * not be expected to do as much as you will find in this source file.
 *
 * You should be able to copy and paste most declarations from a
 * source file that you might find to the input to this dcl program.
 *
 * Notable limitations include: the reference operator '&' is not 
 * handled, a symbolic constant that is in an array subscript
 * will be seen as an invalid subscript, type modifiers/qualifiers such as
 * static, const, long, short, unsigned, signed etc. are not handled as they
 * will be part of the answer for Exercise 5-20.
 *
 * This dcl can handle multiple declarations on a line such as
 * char *p, line[MAXLEN];
 * dcl can also handle comments, and detects end of declaration indicators
 * such as ; or { or =
 *
 * Recovery from errors is interpreted to mean that despite missing one of
 * [ or ] or ( or ) or NAME, an error message should be displayed and some 
 * educated interpretation should be attempted. In the case of parentheses
 * it is sometimes difficult to know where they should have been so the
 * resulting interpretation will not necessarily be correct.
 *
 * In the case of a missing NAME, instead of outputting an error the name
 * is simply omitted from the output. The following for example:
 * char (*(*())[])()
 * will output:
 * :  function returning pointer to array[] of pointer to function returning char
 */

enum { NAME, PARENS, BRACKETS };

static void dcl(void);
static void dirdcl(void);

static int gettoken(void);
static int tokentype; /* type of last token */
static int oldtoken; /* previous tokentype */
static char token[MAXTOKEN]; /* last token string */
static char name[MAXTOKEN]; /* identifier name */
static char datatype[MAXTOKEN]; /* data type = char, int, etc. */
static char out[1000]; /* output string */
static int parenopen = 0; /* count of open parentheses */
static int parenclose = 0; /* count of unmatched close parentheses */
static int alphaseen = 0; /* track alpha statis for subscript */

int main(void) /* convert declaration to words */
{
	while (gettoken() != EOF) { /* 1st token on line */		
		strcpy(datatype, token); /* pull data type */
		while (tokentype != '\n'){
			out[0] = '\0';
			name[0] = '\0';
			token[0] = '\0';
			parenopen = 0;
			parenclose = 0;
			dcl(); /* parse rest of line */
			if (parenopen > 0) {
				printf("error: missing one or more )\n");
			} 
			if (parenclose > 0) {
				printf("error: missing one or more (\n");
			}
			printf("%s: %s %s\n", name, out, datatype);
		}
	}	
	return 0;
}

/* dcl: parse a declarator */
static void dcl(void)
{
	int ns;
	for (ns = 0; gettoken() == '*'; ) /* count *'s */
		ns++;
	dirdcl();
	while (ns-- > 0)
		strcat(out, " pointer to");
}

/* dirdcl: parse a direct declarator */
static void dirdcl(void)
{
	int tempparens;
	if (tokentype == ','){ /* allow multiple declarators on a line */
		tokentype = '\n';
	}
	if (tokentype == '\n'){
		return;
	}
	if (tokentype == '('){ /* ( dcl ) */
		++parenopen;
		dcl();
		if (tokentype == ')'){
			--parenopen;
		} else { 
			if(oldtoken == '(')
				strcat(out, " function returning");
			ungetch();
		}
	} else if (tokentype == NAME){ /* variable name */
		strcpy(name, token);
	} else if (tokentype == PARENS){
		strcat(out, " function returning");
	} else if (tokentype == BRACKETS){
		strcat(out, " array");
		strcat(out, token);
		strcat(out, " of");
	} else if (oldtoken == NAME && tokentype == ')'){
		++parenclose;
		strcat(out, " function returning");
	} else {
		printf("error: expected name or (dcl)\n");
	}
	gettoken();
	while (tokentype == PARENS || tokentype == BRACKETS 
			|| tokentype == '(' || isdigit(tokentype) ||
			tokentype == NAME){
		if (tokentype == PARENS){
			strcat(out, " function returning");
		} else if (tokentype == BRACKETS) {
			strcat(out, " array");
			strcat(out, token);
			strcat(out, " of");
		} else if (tokentype == '('){
			/* process function with parameters... for now
			 * just ignoring them */
			/* prevents detection of unmatched ) in gettoken() */
			++parenopen; /* don't remove this! */
			tempparens = 1; /* track balanced parentheses */
			strcat(out, " function returning");
			do {
				gettoken();
				if (tokentype == '('){
					++tempparens;
				} else if (tokentype == ')'){
					--tempparens;
				} else if (tokentype == '\n'){
					return;
				}
			} while (tokentype != ')' || tempparens != 0);
			--parenopen;
		} else {
			/* saw NAME or digit: it could be this was part
			 * of a parameter to a function or a subscript
			 * to an array */
			do {
				gettoken();
				if (tokentype == BRACKETS){
					strcat(out, " array");
					strcat(out, token);
					strcat(out, " of");
					break; /* exit do-while */
				}
				if (tokentype == PARENS){
					strcat(out, " function returning");
					break; /* exit do-while */
				}
			} while(tokentype != '\n');
			if (tokentype == '\n')
				return;
		}
		gettoken();
	}
}

/* gettoken: return next token.
 * I have getch() and ungetch() included from the header
 * file getch.h , note that ungetch() in my implementation
 * requires no parameter.
 */
static int gettoken(void)
{
	int c;
	int i;
	char *p = token;
	oldtoken = tokentype; /* back up tokentype */
	while ((c = getch()) == ' ' || c == '\t')
		;
	/* remove comments */
	while (c == '/'){
		c = getch();
		if (c == '/'){
			while (c != '\n')
				c = getch();
		}
		if (c == '*'){ /* start of comment */
			do {
				c = getch();
				if (c == '*'){
					c = getch();
					if(c == '/'){
						c = getch();
						break;
					}
				}
			} while (c != '\n');
		}
	}
	/* Assume anything past one of these characters is something
	 * we do not care to process. */
	if (c == ';' || c == '{' || c == '='){
		while (c != '\n')
			c = getch();
	} 
	/* count unmatched closing parentheses */
	if (c == ')' && parenopen == 0){		
		++parenclose;
		return tokentype = PARENS;
	}
	if (c == '('){
		if ((c = getch()) == ')'){
			strcpy(token, "()");
			return tokentype = PARENS;
		} else {
			ungetch();
			return tokentype = '(';
		}
	} else if (c == '['){ /* we sort of know what should be in here */
		alphaseen = 0;
		for (*p++ = c, i = 0; (c = getch()); ++i){
			while (c == ' ' || c == '\t') /* allow for space */
				c = getch();
			if (i == 0 && isalpha(c)){
				*p++ = c;
				alphaseen = 1;
			} else if (isdigit(c) && alphaseen == 0){
				*p++ = c;
			} else if (i > 0 && isalpha(c)){
				printf("error: invalid array subscript\n");
				while (isalnum(c = getch()))
					;
				ungetch();
			} else if (isdigit(c) && alphaseen){
				printf("error: invalid array subscript\n");
				while (isalnum(c = getch()))
					;
				ungetch();
			} else if (c == ']'){
				*p++ = c;
				break;
			} else {
				printf("error: missing ]\n");
				*p++ = ']';
				ungetch();
				break;
			}
		}
		*p = '\0';
		return tokentype = BRACKETS;
	} else if (c == ']'){ /* unmatched closing bracket? */
		printf("error: missing [\n");
		*p++ = '[';
		*p++ = ']';
		*p = '\0';
		return tokentype = BRACKETS;
	} else if (isalpha(c) || c == '_') { /* an _ is a valid char too */
		for (*p++ = c; isalnum(c = getch()) || c == '_'; )
			*p++ = c;
		*p = '\0';
		ungetch();
		return tokentype = NAME;
	} else {
		return tokentype = c;
	}
}

getch.c file:

#include <stdio.h>
#define BUFSIZE 1000
static char line[BUFSIZE]; /* buffer for line */
static int bufp = 0; /* position in buf */
static int readflag = 1;
static int get_line(char *s, int max_length);
/* getch: read a character */
int getch(void)
{
	int length = 0;
	do {
		if (readflag == 1){
			/* prime the line array */
			if ((length = get_line(line,BUFSIZE)) > 0){
				if(length >= (BUFSIZE - 1)){
					printf("ERROR: line buffer exceeded\n");
					return EOF;
				}
			}else{
				return EOF;
			}
			bufp = 0;
			readflag = 0;
		}
		if (line[bufp] == '\0')
			readflag = 1; /* need to read a new line */	
	} while (readflag);
	return line[bufp++];
}
/* ungetch: push character back on input */
void ungetch(void)
{
	if (bufp > 0)
		--bufp;
}
/*
 * print_source: print out the contents of the line read
 */
void print_source(void)
{
	printf("%s", line);
}
/*
 * getline: 
 * Author: Robert Taylor
 * This version will read a line up to a maximum number of characters
 * (max_length - 1) and store a '\0' at the end. The number of characters
 * read is returned, if that is a 0 then we are done reading lines.
 */
int get_line(char *s, int max_length)
{
	int c;
	char *start = s; /* save pointer to start of buffer s */
	char *end = s + (max_length - 2); /* point near to end of buffer s */
	while((c = getchar()) != EOF && c != '\n' && s < end){
		*s++ = c;	
	}
	/* store last character read */
	if(c != EOF)
		*s++ = c;
	*s = '\0';/* terminate the line */
	return s - start; /* number of characters read */
}

getch.h file:

#ifndef GETCH
#define GETCH
int getch(void);
void ungetch(void);
void print_source(void);
#endif


Solution by codybartfast

Category 0

This is less ambitious than the above solution. It does not attempt to generate fewer errors nor handle more realistic input. It only changes how the program behaves once an error is detected.

Given a list containing declarations:

int dcl()
int dirdcl()
int gettoken()
int tokentype
char token[100]

The code in the book produces the following:

dcl:  function returning int
dirdcl:  function returning int
gettoken:  function returning int
tokentype:  int
token:  array[100] of char

But, if it encounters an error (or just a realistic declaration):

int dcl()
int dirdcl(); /* direct declaration */
int gettoken()
int tokentype
char token[100]

then it doesn't just print an error message, it produces garbage output and steps on the following declaration of gettoken:

dcl:  function returning int
sytax error
dirdcl:  function returning int
sytax error
comment:  pointer to ()
error: expected name or (dcl)
sytax error
comment:  comment
error: expected name or (dcl)
comment:  gettoken
tokentype:  int
token:  array[100] of char

By adding basic error handling just the error message is printed and processing continues on the next line:

dcl:  function returning int
sytax error
gettoken:  function returning int
tokentype:  int
token:  array[100] of char

This is done by having errors bubble up to main which skips the rest of the problematic line:

#include <stdio.h>
#include <string.h>
#include <ctype.h>

#define MAXTOKEN 100
#define BUFSIZE (1 << 10)

enum { NAME, PARENS, BRACKETS };
enum { OK = 0, ERROR };

int dcl(void);
int dirdcl(void);

void nextline(void);
int gettoken(void);
int tokentype;
char token[MAXTOKEN];
char name[MAXTOKEN];
char datatype[MAXTOKEN];
char out[1 << 10];

int getch(void);
void ungetch(int c);
int buf[BUFSIZE];
int bufp = 0;

int main(void)
{
	int rslt;

	while (gettoken() != EOF) {
		strcpy(datatype, token);
		out[0] = '\0';
		if ((rslt = dcl()) != OK)
			nextline();
		else if (tokentype != '\n') {
			printf("sytax error\n");
			nextline();
		} else
			printf("%s: %s %s\n", name, out, datatype);
	}
	return 0;
}

void nextline(void)
{
	int c;

	while ((c = getch()) != '\n' && c != EOF)
		;
	if (c == EOF)
		ungetch(c);
}

int dcl(void)
{
	int ns, rslt;

	for (ns = 0; gettoken() == '*';)
		ns++;
	if ((rslt = dirdcl()) != OK)
		return rslt;
	while (ns-- > 0)
		strcat(out, " pointer to");
	return OK;
}

int dirdcl(void)
{
	int type, rslt;

	if (tokentype == '(') {
		if ((rslt = dcl()) != OK)
			return rslt;
		if (tokentype != ')') {
			printf("error: missing )\n");
			return ERROR;
		}
	} else if (tokentype == NAME)
		strcpy(name, token);
	else {
		printf("error: expected name or (dcl)\n");
		return ERROR;
	}
	while ((type = gettoken()) == PARENS || type == BRACKETS)
		if (type == PARENS)
			strcat(out, " function returning");
		else {
			strcat(out, " array");
			strcat(out, token);
			strcat(out, " of");
		}
	return OK;
}

int gettoken(void)
{
	int c;
	char *p = token;

	while ((c = getch()) == ' ' || c == '\t')
		;
	if (c == '(') {
		if ((c = getch()) == ')') {
			strcpy(token, "()");
			return tokentype = PARENS;
		} else {
			ungetch(c);
			return tokentype = '(';
		}
	} else if (c == '[') {
		for (*p++ = c; (*p++ = getch()) != ']';)
			;
		*p = '\0';
		return tokentype = BRACKETS;
	} else if (isalpha(c)) {
		for (*p++ = c; isalnum(c = getch());)
			*p++ = c;
		*p = '\0';
		ungetch(c);
		return tokentype = NAME;

	} else {
		return tokentype = c;
	}
}

int getch(void)
{
	return (bufp > 0) ? buf[--bufp] : getchar();
}

void ungetch(int c)
{
	if (bufp >= BUFSIZE)
		printf("ungetch: too many characters\n");
	else
		buf[bufp++] = c;
}

Solution by anonymous

It appears that codybartfast and I had the same idea when it came to this exercise. Before making changes to the program, if one error occurred on a line, usually multiple other errors would trigger and it would be difficult to understand what the true problem was. Plus, the rest of the program wouldn't function correctly even if a valid input was given. So making it recovery from input errors seems as simple as allowing it to function still after an error occurred. Plus, it seemed necessary to reduce the amount of error messages generated and only print the first one since it is the most accurate.

I originally used hard coded ints, but I stole the enum idea from codybartfast idea in my program.

Here is my code

#include <stdio.h>
#include <string.h>
#include <ctype.h>

/*
    Exercise 5-18. Make dcl recover from input errors.
*/

#define MAXTOKEN 100
#define BUFSIZE 100

enum { NAME, PARENS, BRACKETS };
enum { OK, ERROR };

int dcl(void);
int dirdcl(void);
int gettoken(void);
int getch(void);
void ungetch(int c);

int tokentype;              // type of last token
char token[MAXTOKEN];       // last token string
char name[MAXTOKEN];        // identifier name
char datatype[MAXTOKEN];    // data type = char, int, etc.
char out[1000];             // output string
char buf[BUFSIZE];          // buffer for ungetch
int bufp = 0;               // next free position in buf

int main() // convert declaration to words
{
    while (gettoken() != EOF) // 1st token on line
    {
        strcpy(datatype, token); // is the datatype
        out[0] = '\0';
        if(dcl() == OK && tokentype == '\n') // parse rest of line and print out if no error
            printf("%s: %s %s\n", name, out, datatype);
        else // error occurred, skip the rest of the line
        {
            if (tokentype != '\n')
                printf("syntax error\n");
            for (int c = '\0'; c != '\n' && c != EOF; )
                if ((c = getch()) == EOF)
                    ungetch(c);
        }
    }
    return 0;
}

// skips blanks and tabs, then finds the next token in the input; a "token" is a name, a pair of parentheses, a pair of brackets perhaps including a number, or any other
// single character.
int gettoken(void) // return next token
{
    int c;
    char *p = token;

    while ((c = getch()) == ' ' || c == '\t')
        ;
    if (c == '(')
    {
        if ((c = getch()) == ')')
        {
            strcpy(token, "()");
            return tokentype = PARENS;
        }
        else
        {
            ungetch(c);
            return tokentype = '(';
        }
    }
    else if (c == '[')
    {
        for (*p++ = c; (*p++ = getch()) != ']'; )
            ;
        *p = '\0';
        return tokentype = BRACKETS;
    }
    else if (isalpha(c))
    {
        for (*p++ = c; isalnum(c = getch()); )
            *p++ = c;
        *p = '\0';
        ungetch(c);
        return tokentype = NAME;
    }
    else
        return tokentype = c;
}

// parse a declarator
int dcl(void)
{
    int ns;
    for (ns = 0; gettoken() == '*'; ) // count *'s
        ns++;
    if (dirdcl() == ERROR)
        return ERROR;
    while (ns-- > 0)
        strcat(out, " pointer to");
    return OK;
}

// parse a direct declarator
int dirdcl(void)
{
    int type;
    if (tokentype == '(') // ( dcl )
    {
        if (dcl() == ERROR)
            return ERROR;
        if (tokentype != ')')
        {
            printf("error: missing )\n");
            return ERROR;
        }
    }
    else if (tokentype == NAME) // variable name
        strcpy(name, token);
    else
    {
        printf("error: expected name or (dcl)\n");
        return ERROR;
    }
    while ((type = gettoken()) == PARENS || type == BRACKETS)
        if (type == PARENS)
            strcat(out, " function returning");
        else
        {
            strcat(out, " array");
            strcat(out, token);
            strcat(out, " of");
        }
    return OK;
}

// get a (possibly pushed back) character
int getch(void)
{
    return (bufp > 0) ? buf[--bufp] : getchar();
}

// push character back on input
void ungetch(int c)
{
    if (bufp >= BUFSIZE)
        printf("ungetch: too many characters\n");
    else
        buf[bufp++] = c;
}
Personal tools