Jump to: navigation, search

The C Programming Language, 2nd Edition, by Kernighan and Ritchie
Exercise 6.02 on page 143

Write a program that reads a C program and prints in alphabetical order each group of variable names that are identical in the first 6 characters but different somewhere thereafter. Don't count words within strings and comments. Make 6 a parameter that can be set from the command line.



Solution by Akil Adeshwar

/*
   Write a program that reads a C program and prints in alphabetical order each group
   of variable names that are identical in the first 6 characters but different somewhere
   thereafter. Don't count words within strings and comments. Make 6 a parameter that 
   can be set from the command line.
   */

#include<stdio.h>
#include<string.h>
#include<stdlib.h>

#define BUFSIZE 1000
#define MAXLEN 100

#define GROUP_MAX 1000
#define COMP_INDEX_LIMIT_DEFAULT 1

char buf[BUFSIZE];
int bufp = 0;

char *keyword_arr[]  = {"include", "main" ,"return","int","char","void","\0"};
// Contains the list of keyword.
int keyword_count=0;


typedef struct var{
	char word[MAXLEN];
	int count;
	struct var *left;
	struct var *right;
}variable;

// Data structure to hold the variables.



variable *root  = NULL;

/*
 All the variable with at least cmp_index_limit characters,
 which is obtained as cmd line argument(default 6)
 will be in the same group.
*/
variable groups[GROUP_MAX];
int group_count=0;

int cmp_index_limit = COMP_INDEX_LIMIT_DEFAULT;


void copy_var(variable *s,variable *t){

	strcpy(s->word,t->word);
	s->count = t->count;
	s->left = t->left;
	s->right = t->right;

}


variable *add_to_tree(variable *root,variable *p){

	if(root == NULL){
		root = (variable *) malloc(sizeof(variable));
		copy_var(root,p);
	}
	else{
		if(strcmp(p->word,root->word)<0)
			root->left = add_to_tree(root->left,p);
		else if(strcmp(p->word,root->word)>0)
			root->right = add_to_tree(root->right,p);
		else
			root->count++; // Same word occurring again
	}
	return root;
}


variable *add_to_group(variable *p){

	int i=0,inserted_flag=0;
	for(;i<group_count;i++){
		if(strncmp(groups[i].word,p->word,cmp_index_limit)==0){
			add_to_tree(&groups[i],p);
			inserted_flag=1;
		}
	}
	if(!inserted_flag){
		copy_var(&groups[group_count],p);
		group_count++;
	}
}

// Check if find is a keyword

int bin_search_keyword_arr(char find[]){

	int low,high;
	high = keyword_count-1;
	low =0;
	while(low<=high){
		int mid = (low+high)/2;
		//printf("%s -- %s + low: %d high %d mid %d \n",keyword_arr[mid],find,low,high,mid);
		int comp = strcmp(find,keyword_arr[mid]);
		if(comp == 0)
			return mid;
		else if(comp<0)
			high=mid-1;
		else
			low=mid+1;
	}
	return -1;
}

int getch(FILE *fp){
	return (bufp > 0)? buf[--bufp] : fgetc(fp);
}

void ungetch(int c){
	if(bufp >= BUFSIZE)
		printf("\nUngetch: Too many characters");
	else
		buf[bufp++] = c;
}


// getword returns the length of the word.
// Word can begin with an underscore.
int getword(char *word,int lim,FILE *fp){
	
	int c;
	char *w = word;

	while(isspace(c = getch(fp)));
	if(c==EOF)
		return -1;
	
	// Word begin with alpha or _
	if(isalpha(c) || c=='_')
		*w++=c;
	//Remove <*>
	if(c=='<'){
		while(c!='>')
			c = getch(fp);
	}

	//Remove comments
	if(c=='/'){
		c = getch(fp);
		if(c=='/'){
			while(c!='\n' && c!=EOF)
				c = getch(fp); // skip till end of line.
		}
		else if(c=='*'){
			while(1){
				c = getch(fp); 
				if(c == '*'){
					c = getch(fp);
					if(c=='/' || c==EOF)
						break; // break on abrupt end of file.
				}
				if(c == EOF)
					break; // break on abrupt end of file.
			}
		}
	}

	//Remove string constants
	if(c=='"'){
		do{
			c = getch(fp);
		}while(c!='"' && c!=EOF);
	}

	if(!isalpha(c) && c!='_'){
		*w = '\0';
		return w-word;
	}
	for(; --lim>0; w++){
		*w = getch(fp);

		if(!isalnum(*w) && *w!='_'){
			ungetch(*w);
			break;
		}
	}
	*w = '\0';
	return w-word;
}



// For using binary search
void sort_keyword_arr(){
	int i=0;
	char *t;
	for(i=0;i<keyword_count-1;i++){
		if(strcmp(keyword_arr[i],keyword_arr[i+1])>0){
			t = keyword_arr[i];
			keyword_arr[i] = keyword_arr[i+1];
			keyword_arr[i+1] = t;
			i = -1;
		}
	}
}

void sort_groups_arr(){
	int i=0;
	variable t;
	for(i=0;i<group_count-1;i++){
		if(strcmp(groups[i].word,groups[i+1].word)>0){
			t = groups[i];
			groups[i] = groups[i+1];
			groups[i+1] = t;
			i = -1;
		}
	}
}

variable *create_node(char *w){

	variable *a = (variable *) malloc(sizeof(variable));
	strcpy(a->word,w);
	a->count = 1; // Found one already.
	a->left = NULL;
	a->right= NULL;
	return a;
}

void traverse_tree(variable *root){

	if(root!=NULL){
		traverse_tree(root->left);
		printf("%s - Count: %d \n",root->word,root->count);
		traverse_tree(root->right);
	}
}

int main(int argc,char *argv[]){

	char line[MAXLEN];
	FILE *fp = fopen("t2.txt","r"); // Input file with C program
	if(fp!=NULL){

		if(argc>1){
			cmp_index_limit = atoi(argv[1]);
		}
		// Calculate no of keywords
		int i=0;
		while(keyword_arr[i++][0]!='\0')
			keyword_count++;
		// Sort keywords for binary search
		
		sort_keyword_arr();

		// Sort list
		/*for(i=0;i<keyword_count;i++)
			puts(keyword_arr[i]); */


		puts("Results: ");
		while(getword(line,MAXLEN,fp)!=-1){
			if(line[0]!='\0' && bin_search_keyword_arr(line)==-1){	
				// line should not be null and must not be a keyword.
				//puts(line);
				variable *node = create_node(line);
				//puts(node->word);
				add_to_group(node);
			}
		}
		fclose(fp);

		// Sort groups to alphabetical order
		sort_groups_arr();
		for(i=0;i<group_count;i++){
			printf("Group - %d \n",i+1);
			traverse_tree(&groups[i]);
			putchar('\n');
		}
	}
	return 0;
}


Solution by Barrett Drawdy

/*
 * Exercise 6-2 in K&R2 on page 143.  By: Barrett Drawdy
 * Write a program that reads a C program and prints in alphabetical
 * order each group of variable names that are identical in the first
 * 6 characters, but different somewhere thereafter.  Don't count words
 * within strings and comments.  Make 6 a parameter that can be set
 * from the command line.
 */
 
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>
#include <limits.h>

#define MAXWORD 1000     /* longest word that can be read by getword */
#define DEFAULT_COMP_LEN 6  /* default length to compare */

/*
 * tnode: Tree node from K&R2 page 140.  Words are initially read into
 * the tree by getword.
 */
struct tnode
{
	char *word;
	int count;
	struct tnode *left;
	struct tnode *right;
};

/*
 * simroot: Part of a linked list of pointers to simword lists with
 * a common root.
 */
struct simroot
{
	struct simword *firstword; /* points to the list of words */
	struct simroot *nextroot;  /* points to the next node in the list */
};

/*
 * simword: Part of a linked list of words with a common root.  Points
 * to the word in the tnodes.
 */
struct simword
{
	char *word;   /* points to the word in the tree */
	int count;    /* copied from the tree */
	struct simword *nextword; /* next node */
};

struct tnode *addtree(struct tnode *, char *);
void treeprint(const struct tnode *);
int getword(char *, int);
struct simroot *addroot(struct simroot *, struct tnode *, int);
struct simroot *parse(struct tnode *, int);
void printlist(struct simroot *, int);
void printwords(struct simword *);

int main(int argc, char *argv[])
{
	struct tnode *root;
	char word[MAXWORD];
	struct simroot *listroot;
	int len;
	
	/* get all the words */
	root = NULL;
	while(getword(word, MAXWORD) != EOF)
		if(isalpha(word[0]))
			root = addtree(root, word);
	
	if(argc == 1)
		len = DEFAULT_COMP_LEN;
	else if(argc == 2)
		len = atoi(argv[1]);
	else {
		printf("Incorrect number of arguments.\n");
		return 1;
	}
	
	/* parse the tree to find similar words and populate list */
	listroot = NULL;
	listroot = parse(root, len);
	treeprint(root);               /* prints all the words */
	printf("\nDuplicate list:\n\n");
	printlist(listroot, len);      /* prints the similar words */
	
	return 0;
} /* end of main() */

/*
 * parse: Reads the tree created by getword.  It finds words that are
 * similar in the first len letters and places them in the list structure
 * that it creates.
 */
struct simroot *parse(struct tnode *node, int len)
{
	struct tnode *temp; /* pointer to the children of the node */
	int this_len;       /* length of the word in the current node */
	static struct simroot *root = NULL; /* points to the root of the tree */
	
	if(node == NULL)
		return NULL;
	
	this_len = strlen(node->word);
	
	parse(node->left, len); /* start in the left subtree */
	
	temp = node->left;      /* find the closest left child and compare */
	if(temp != NULL) {
		while(temp->right != NULL)
			temp = temp->right;
		/* if the word matches, put both words in the list */
		if(this_len >= len && strncmp(temp->word, node->word, len) == 0) {
			root = addroot(root, temp, len);
			addroot(root, node, len);
		}
	}
	
	temp = node->right;    /* find the closest right child and compare */
	if(temp != NULL) {
		while(temp->left != NULL)
			temp = temp->left;
		/* if the word matches, put both words in the list */
		if(this_len >= len && strncmp(temp->word, node->word, len) == 0) {
			root = addroot(root, node, len);
			addroot(root, temp, len);
		}
	}
	
	parse(node->right, len);  /* continue on to right subtree */
	
	return root;
}

/*
 * printlist: Prints the roots that were similar, followed by the list
 * of words.
 */
void printlist(struct simroot *p, int len)
{
	int i;
	if(p == NULL)
		return;
	for(i = 0; i < len; ++i)    /* print the root */
		putchar(p->firstword->word[i]);
	putchar('\n');
	printwords(p->firstword);   /* print the list of words */
	printlist(p->nextroot, len); /* print the next root/list */
}

/*
 * printword: Prints the list of words with the same roots.
 */
void printwords(struct simword *p)
{
	printf("  %4d %s\n", p->count, p->word);
	if(p->nextword != NULL)
		printwords(p->nextword);
}

struct tnode *talloc(void);
char *strdup(char *);
struct simword *walloc(struct simword *, struct tnode *);
void addword(struct simword *, struct tnode *);

/*
 * addroot: When a node n is passed to addroot, n->word is compared to the
 * first word in ps list.  If they have are the same for the first 'len'
 * characters, then the word is added to that list.  Otherwise, it is passed
 * along the simroots until it reaches the end, where a new simroot is 
 * created if the word has a new root.
 */
struct simroot *addroot(struct simroot *p, struct tnode *n, int len)
{
	/* end of list, create a new root */
	if(p == NULL) {
		p = (struct simroot *) malloc(sizeof(struct simroot));
		p->nextroot = NULL;
		p->firstword = walloc(p->firstword, n);
	}
	/* word belongs to this list, add it */
	else if(strncmp(p->firstword->word, n->word, len) == 0)
		addword(p->firstword, n);
	/* haven't found the right root or end yet */
	else
		p->nextroot = addroot(p->nextroot, n, len);
	return p;
}

/*
 * addword: Compares the word from n to the word in p.  If they are the same,
 * the word was already added and it returns.  This continues down the list
 * until the end, where a new node is created if the word is new.
 */
void addword(struct simword *p, struct tnode *n)
{
	/* word was already added */
	if(strcmp(p->word, n->word) == 0)
		return;
	/* end of list. create a new node */
	if(p->nextword == NULL)
		p->nextword = walloc(p->nextword, n);
	/* haven't reached the end yet */
	else
		addword(p->nextword, n);
}

/* walloc: Creates a new simword node. */
struct simword *walloc(struct simword *p, struct tnode *n)
{
	p = (struct simword *) malloc(sizeof(struct simword));
	if(p != NULL) {
		p->word = n->word;
		p->count = n->count;
		p->nextword = NULL;
	}
	return p;
}

/*
 * getword:  Modified from the original on page 136 of K&R2 for exercise
 * 6-1.  Get next word or character from input.
 * Returns EOF, first character of word for strings, or other non-alpha
 * characters.  The function ignores pre-processor directives, ANSI C
 * style comments, and string literals.
 */
int getword(char *word, int lim)
{
	int c, getch(void);
	void ungetch(int);
	char *w = word;
	static int line_beg = 1;    /* 1 at beginning of a new line */
	static int after_slash = 0; /* 1 after '\' */
	int after_star = 0;         /* 1 after '*' */
	
	if(isspace(c = getch()))
		after_slash = 0;
	while(isspace(c)) {
		if(c == '\n')
			line_beg = 1;
		c = getch();
	}
	
	if(c != EOF)
		*w++ = c;
	
	if(c == '#' && line_beg == 1) {            /* preprocessor directive */
		while((c = getch()) != '\n' && c != EOF)    /* go to end of line */
			;
		return getword(word, lim);              /* start over */
	}
	line_beg = 0;
	
	if(c == '\\')                              /* set after_slash flag */
		after_slash = after_slash ? 0 : 1;         /*ignore \\ */
	
	else if(c == '/' ) {
		if((c = getch()) == '*' && !after_slash) {  /* comment beginning */
			while((c = getch()) != EOF) {
				if(c == '/') {
					if(after_star)                 /* end of comment */
						return getword(word, lim); /* start over */
				}
				else if(c == '*' && !after_slash)
					after_star = 1;
				else if(c == '\\')
					after_slash = after_slash ? 0 : 1;    /*ignore \\ */
				else {
					after_star = 0;
					after_slash = 0;
				}
			}
		}                                     /* end comment block */
		
		after_slash = 0;       /* not after slash anymore */
		if(c != EOF)
			ungetch(c);
	}
	
	else if(c == '\"') {
		if(!after_slash) {                  /* string literal begin */
			--w;                            /* reset w */
			while((c = getch()) != EOF) {
				if(c == '\"' && !after_slash)
					break;
				else if(c == '\\')
					after_slash = after_slash ? 0 : 1;    /*ignore \\ */
				else
					after_slash = 0;
				*w++ = c;
			}
			*w = '\0';
			if(c == EOF)
				return EOF;
			else
				return getword(word, lim);            /* start over */
		}
		after_slash = 0;                  /* not after a slash anymore */
	}
	
	if(!isalpha(c) && c != '_') {           /* it's a symbol */
		*w = '\0';
		if(c != '\\')
			after_slash = 0;
		return c;
	}
	
	/* reset this flag since a slash would have just returned */
	after_slash = 0;
	
	for( ; --lim > 0; ++w)                  /* it's a word or letter */
		if(!isalnum(*w = getch()) && *w != '_') {
			ungetch(*w);
			break;
		}
	*w = '\0';
	return word[0];
} /* end getword() */


/***************************************************************************
 *                    All code below here is from K&R2.                    *
 ***************************************************************************/

/*
 * addtree: From K&R2 page 141.
 * Adds a node containing w, at or below node p.
 */
struct tnode *addtree(struct tnode *p, char *w)
{
	int cond;
	
	if(p == NULL) { /* new word */
		p = talloc();
		p->word = strdup(w);
		p->count = 1;
		p->left = p->right = NULL;
	}
	else if((cond = strcmp(w, p->word)) == 0)
		p->count++;
	else if(cond < 0)
		p->left = addtree(p->left, w);
	else
		p->right = addtree(p->right, w);
	return p;
}

/* treeprint: From K&R2 page 142. Prints tree p in-order. */
void treeprint(const struct tnode *p)
{
	if(p != NULL) {
		treeprint(p->left);
		printf("%4d %s\n", p->count, p->word);
		treeprint(p->right);
	}
}

/* talloc: From K&R2 page 142. Makes a tnode. */
struct tnode *talloc(void)
{
	return (struct tnode *) malloc(sizeof(struct tnode));
}

/* strdup: From K&R2 page 143. Makes a duplicate of s. */
char *strdup(char *s)
{
	char *p;
	p = (char *) malloc(strlen(s) + 1);
	if(p != NULL)
		strcpy(p, s);
	return p;
}

/*
 * getch and ungetch are from K&R2, page 79
 */
#define BUFSIZE 100

char buf[BUFSIZE];  /* buffer for ungetch() */
int bufp = 0;       /* next free position in buf */

int getch(void) { /* get a (possibly pushed back) character */
	return (bufp > 0) ? buf[--bufp] : getchar();
}

void ungetch(int c) {  /* push character back on input */
	if(bufp >= BUFSIZE)
		printf("ungetch: too many characters\n");
	else
		buf[bufp++] = c;
	return;
}



Solution by Alex Hoang (Category 0)


/* This solution takes advantage of the fact the elements of the tree are printed in order. If the 
 * first n chars of a series of strings are similar, then by the nature of the tree, they will be printed
 * in order.  We just have to keep track of when the comparisons (first n chars are equal)  are true, and
 * print the series.  Thus, there is no need to allocate any extra memory to solve this.
 * 
 * We will use a static pointer to the previous node encountered AFTER it reaches the end of recursion for comparison purposes.
 * (This is the node that would have been printed in the original treeprint function.)  We don't print it until
 * we find a true comparison in the next string.  When the comparison is true, the previous word is printed along with the 
 * current word.  Only at the beginning of the series will previous word be printed.
 *
 * If n = 0, then the function will act like a regular treeprint function, and print all of the nodes.
 * 
 * The comparison function mystrcmp will return 1 if the first n chars are equal.  If n = 0, then mystrcmp will
 * return 1 by default.
 *
 * The major changes are below.  Everything else can be found in K&R2.
 */

int mystrcmp(char *, char *, int);

void mytreeprint(struct tnode *p, int n)
{
    static int printprevious = 1; /*flag to print previous node */
    static struct tnode *previous;
    if (p != NULL)
    {
        mytreeprint(p->left, n);
        if (n == 0) /* if n = 0, print all the nodes */
            printf("%4d %s\n", p->count, p->word);
        else
        {
            if (previous != NULL)
            {
                if (mystrcmp(previous->word, p->word, n))  /* returns 1 if first n chars are identical */
                {

                    if (printprevious)
                    {
                        printf("%4d %s\n", previous->count, previous->word);
                        printprevious = 0; /* when previous has been printed, no need to print again until a new series is encountered */
                    }
                    printf("%4d %s\n", p->count, p->word);


                }
                else
                    printprevious = 1; /* if the comparison are no longer true, reset printprevious to get ready for next series */

            }
            previous = p; /* keep track of pointers to previous encountered node for comparison purposes */
        }
        mytreeprint(p->right, n);
    }
}

int mystrcmp(char *s1, char *s2, int n)
{
    int i;
    if (n == 0)
        return 1;
    for (i = 0; *s1 == *s2 && *s1 != '\0' && i < n; i++, *s1++, *s2++)
        ;
    if (i == n)
        return 1;
    return 0;
}


Solution by blob84[blob84@gmail dot com]


/*Exercise 6-2. Write a program that reads a C program and prints in alphabetical order each
group of variable names that are identical in the first 6 characters, but different somewhere
thereafter. Don't count words within strings and comments. Make 6 a parameter that can be set
from the command line.
*/
#include <stdio.h>
#include <ctype.h>
#include <string.h>

#define MAXWORD 100

struct tnode {    	/* the tree node: */
    char *word;           /* points to the text */
    int match;            /* number of occurrences */
    struct tnode *left;   /* left child */
    struct tnode *right;  /* right child */
};

struct tnode *addtree(struct tnode *, char *, int, int);
void treeprint(struct tnode *);
int getword(char *, int);

/* word frequency count */
int main(int argc, char **argv)
{
    	struct tnode *root;
    	char word[MAXWORD];
	int n = 6;
		
	if (argc == 3)
		if (strcmp("-n", argv[1]) == 0)
			n = atoi(argv[2]);
		else {
			printf("error\n");
			return 1;
		}	
	root = NULL;
	while (getword(word, MAXWORD) != EOF) 
		if (isalpha(word[0])) 
			root = addtree(root, word, 0, n);
	treeprint(root);
    	return 0;
}

struct tnode *talloc(void);
char *mstrdup(char *s);

/* addtree: add a node with w, at or below p */
struct tnode *addtree(struct tnode *p, char *w, int found, int n)
{
    	int cond;
   	
	if (p == NULL) {	/* a new word has arrived */
        	p = talloc();	/* make a new node */
        	p->word = mstrdup(w);
        	p->match = (found) ? 1 : 0;
        	p->left = p->right = NULL;	
	} 
    	else if ((cond = strcmp(w, p->word)) < 0) {  	
		if (strncmp(w, p->word, n) == 0) 
			found = p->match = 1;		
		p->left = addtree(p->left, w, found, n);	
	}  	
	else if (cond > 0) {				
		if (strncmp(w, p->word, n) == 0) 
			found = p->match = 1;		
		p->right = addtree(p->right, w, found, n);
	}
	return p;
}

/* treeprint: in-order print of tree p */
void treeprint(struct tnode *p)
{
	if (p != NULL) {
        	treeprint(p->left);
		if (p->match)
   			printf("%4d %s\n", p->match, p->word);
        	treeprint(p->right);
	}
}

/* getword: get next word or character from input */
#define IN 1
#define OUT 0
int getword (char *word, int lim)
{
	int c, d, getch(void), comment, string, directive;
	void ungetch(int);
	char *w = word;

	comment = string = directive = OUT;	
	while (isspace(c = getch()))
		;	
	if (c == '/') {
		if ((d = getch()) == '*')
			comment = IN;	
		else
			ungetch(d);
	}
	else if (c == '\"')
		string = IN;
	else if (c == '#') {
		if ((d = getch()) != '\'') 
			directive = IN;
		ungetch(d);
	}
	else if (c == '\\')	/* salta il carattere \ */ 
		c = getch();
	if (comment == OUT && string == OUT && directive == OUT) {	
		if (c != EOF) 
			*w++ = c;
		if (!isalnum(c) && c != '_') {
			*w = '\0';
			return c;
		}		
		for (; --lim > 0; w++) 
			if (!isalnum(*w = getch()) && *w != '_') {	
				ungetch(*w);
				break;
			}
		*w = '\0';	//printf("w=%s\n", word);
		return word[0];
	}
	else if (comment == IN) {	/* ignora i commenti */
		*w++ = c;
		*w++ = d;		
		while ((*w++ = c = getch())) {
			if (c == '*') {
				if ((c = getch()) == '/') {
					*w++ = c;					
					comment = OUT;
					break;
				}			
				else 
					ungetch(c);
			}
		}
		*w = '\0';	//printf("w=%s\n", word);
	}
	else if (string == IN) {	/* ignora le stringe */
		*w++ = c;		
		while ((*w++ = getch()) != '\"') 		
			if (*w == '\\')	
				*w++ = getch();	
		string = OUT;	
		*w = '\0';	//printf("w=%s\n", word);
	}
	else if (directive == IN) {
		*w++ = c;
		while ((*w++ = c = getch()) != '\n')
			if (c == '\\')
				*w++ = getch();
		directive = OUT;
		*w = '\0';	//printf("len=%d\nw=%s\n", strlen(word), word);
	}
	return c;
}

#define BUFSIZE 100

char buf[BUFSIZE];	/* buffer per ungetch */
int bufp = 0;		/* prossima posizione libera per buf */

int getch(void)		/* legge un carattere, eventualmente accantonato prima */
{
	return (bufp > 0) ? buf[--bufp] : getchar();
}

void ungetch(int c)	/* accantona un carattere letto */
{
	if(bufp >= BUFSIZE)
		printf("ungetch: troppi caratteri \n");
	else {
		buf[bufp++] = c;
	}	
}

#include <stdlib.h>
/* talloc: make a tnode */
struct tnode *talloc(void)
{
    	return malloc(sizeof(struct tnode));
}

char *mstrdup(char *s)   /* make a duplicate of s */
{
    	char *p;
    	
	p = malloc(strlen(s)+1); /* +1 for '\0' */
    	if (p != NULL)
        	strcpy(p, s);
    	return p;
}

Solution by anonymous

The book never defined what a variable is (it does discuss limitations of what a variable name can be on page 35, but never defines it fully). Given what I could find on the Internet and in the C17 standard, variables must have one of the 11 type specifiers (void, char, int, float, etc) and functions are not the same as variables. Note: function pointers are still variables since they are a pointer. I based my program on this crude definition. I included the #define preprocessor directive as well since they act like variables within programs even though they aren't technically variables.

I defined the 44 keywords in the C17 standard, added the binsearch so I could see if the word read didn't match them before adding it to the tree, I added in my status type logic from my previous solution, and changed the treeprint function to only print words that match up to the specified number of letters.

Here is my feedback for the other solutions:

  • Akil Adeshwar's solution didn't seem to filter out anything. Plus, it matched anything with the same first character by default instead of 6. Finally, it printed out groups of matches when the group size was only 1 (meaning it had no matches).
  • Barrett Drawdy's solution is an improvement on Akil Adeshwar's, but it printed out matching function names and matching words in comments.
  • Alex Hoang's solution printed out everything that matches, to include comments, functions, and other non-variables.
  • blob84's solution worked on simple tests, but had segmentation faults on more complex input (like blob84's source code file)

Here is my code

#include <stdio.h>
#include <ctype.h>
#include <string.h>
#include <stdlib.h>

/*
    Exercise 6-2. Write a program that reads a C program and prints in alphabetical order each group of variable names that are identical in the first 6 characters, but
    different somewhere thereafter. Don't count words within strings and comments. Make 6 a parameter that can be set from the command line.
*/

struct tnode // the tree node:
{ 
    char *word;             // points to the text
    struct tnode *left;     // left child
    struct tnode *right;    // right child
};

#define MAXWORD 100
#define BUFSIZE 100
#define DEFAULT_NUM_CHARS 6 // default number of chars to compare in a string
#define KEYWORD_LEN 44 // C17 standard has 44 keywords

struct tnode *addtree(struct tnode *, char *);
struct tnode *talloc(void);
void treeprint(struct tnode *, struct tnode *root);
void ungetch(int);
int getword(char *, int);
int binsearch(char *);
int getword(char *word, int lim);
int getch(void);
char *mystrdup(char *); // strdup conflicts with function name from strings.h so it was renamed to mystrdup

enum statusTypes { NORMAL, STRING, SINGLE_LINE_COMMENT, MULTI_LINE_COMMENT, PREPROCESSOR };
enum boolean { FALSE, TRUE };

char *keywords[KEYWORD_LEN] = { // Current 44 keywords as of C17 standard (ISO/IEC 9899:2017). These must be in alphabetical order
 "_Alignas", "_Alignof", "_Atomic", "_Bool", "_Complex", "_Generic", "_Imaginary", "_Noreturn", "_Static_assert", "_Thread_local", "auto", "break", "case", "char", "const",
 "continue", "default", "do", "double", "else", "enum", "extern", "float", "for", "goto", "if", "inline", "int", "long", "register", "restrict", "return", "short",
 "signed", "sizeof", "static", "struct", "switch", "typedef", "union", "unsigned", "void", "volatile", "while"
};
char prevChar = '\0'; // keep track of last char
int type;             // keep track of the current status
char buf[BUFSIZE];    // buffer for ungetch
int bufp = 0;         // next free position in buf
int numChars = DEFAULT_NUM_CHARS; // number of characters to compare in a string to see if it is the same

// word frequency count
int main(int argc, char *argv[])
{
    if (argc <= 2) // if 1 arg (no args given), value will equal 0 after loop. If 2 args, process 2nd arg and value will equal 0 after loop
    {
        while (--argc > 0)
            if (**++argv == '-' && isdigit(*++*argv))
                numChars = atoi(*argv);
            else
                argc = -1; // prints usage
    } 
    if (argc != 0)
    {
        printf("usage: getVars [-#]");
        return 1;
    }
    if (numChars == 0)
    {
        printf("error: zero is not a valid number\n");
        return 1;
    }

    struct tnode *root;
    char word[MAXWORD];
    char toAdd[MAXWORD];
    root = NULL;

    toAdd[0] = '\0'; // set to null string due to logic of while loop. Otherwise, it will print garbage
    while (getword(word, MAXWORD) != EOF)
    {
        if (word[0] == '"')
        {
            if (type == NORMAL && prevChar != '\'')
                type = STRING;
            else if (type == STRING)
                type = NORMAL;
        }
        else if (word[0] == '/')
        {
            if (prevChar == '/' && type == NORMAL)
                type = SINGLE_LINE_COMMENT;
            else if (type == MULTI_LINE_COMMENT && prevChar == '*')
                type = NORMAL;
        }
        else if (word[0] == '\n' && (type == SINGLE_LINE_COMMENT || type == PREPROCESSOR))
            type = NORMAL;
        else if (word[0] == '*' && prevChar == '/' && type == NORMAL)
            type = MULTI_LINE_COMMENT;
        else if (word[0] == '#' && (prevChar == '\n' || prevChar == '\0')) // if first line is a preprocessor, prevChar will be set to '\0'
            type = PREPROCESSOR;

        if (strlen(toAdd) > 0 && word[0] != '(') // if something is in toAdd from previous iteration and the current char is not a '(', then add it since not a function
            root = addtree(root, toAdd);
        toAdd[0] = '\0'; // reset toAdd each iteration
        if (type == NORMAL && (isalpha(word[0]) || word[0] == '_') && binsearch(word) == -1) // if normal word that starts with a letter or underscore and is not a keyword
            strcpy(toAdd, word); // prepare to add it
        if (type == PREPROCESSOR && prevChar == '#' && strcmp(word, "define") == 0) // this turns off the preprocessor status since #define contains a variable name
            type = NORMAL;                                                          // ifdef, ifndef, undef didn't seem appropriate to allow despite using variable names
        if (strlen(word) == 1)
            prevChar = word[0]; // save previous character for status type logic
    }
    if (strlen(toAdd) > 0) // if something is left in toAdd after loop ended, then add it
        root = addtree(root, toAdd);
    treeprint(root, root);
    return 0;
}

// find word in tab[0] ... tab[n-1], tab[] must be in alphabetical order
int binsearch(char *word)
{
    int result, mid, low = 0, high = KEYWORD_LEN - 1;
    while (low <= high)
    {
        mid = (low + high) / 2;
        if ((result = strcmp(word, keywords[mid])) < 0)
            high = mid - 1;
        else if (result > 0)
             low = mid + 1;
        else
            return mid;
    }
    return -1; // not found
}

// add a node with w, at or below p
struct tnode *addtree(struct tnode *p, char *w)
{
    int result;
    if (p == NULL) // a new word has arrived
    {
        p = talloc(); // make a new node
        p->word = mystrdup(w);
        p->left = p->right = NULL;
    }
    else if ((result = strcmp(w, p->word)) < 0) // less than into left subtree
        p->left = addtree(p->left, w);
    else if (result > 0) // greater than into right subtree
        p->right = addtree(p->right, w);
    return p; // repeated word
}

// in-order print of tree p. This won't print the last matching item and must be manually printed 
void treeprint(struct tnode *p, struct tnode *root)
{
    static char prevWord[MAXWORD] = { '\0' }; // set to null
    static int printPrev = FALSE; // used to print prevWord if it matched the previous previous word but does not match current word. This system prevents duplicate prints
    if (p != NULL)
    {
        treeprint(p->left, root); // get all left words in recursive order (lowest left tree node first, then go up)
        if (strncmp(p->word, prevWord, numChars) == 0) // prevWord matches current word, print prevWord and update flag
        {
            printf("%s\n", prevWord);
            printPrev = TRUE;
        }
        else if (printPrev) // if current word does not match prevWord, but prevWord matched the previous prevWord, print it and update flag
        {
            printPrev = FALSE;
            printf("%s\n\n", prevWord); // the extra newline is a separator for each group
        }
        strcpy(prevWord, p->word); // copy current word to prevWord for logic
        treeprint(p->right, root); // get all right words in recursive order (lowest left tree node that has right node, then next lowest left node with a right node)
        if (p == root && printPrev) // last node in recursion, so print last matching word if any
            printf("%s\n", prevWord);
    }
}

struct tnode *talloc(void)
{
    return (struct tnode *) malloc(sizeof(struct tnode));
}

char *mystrdup(char *s) // make a duplicate of s
{
    char *p = (char *) malloc(strlen(s) + 1); // + 1 for '\0'
    if (p != NULL)
        strcpy(p, s);
    return p;
}

// get next word or character from input
int getword(char *word, int lim)
{
    int c;
    char *w = word;
    while ((c = getch()) == '\t' || c == ' ')
        ;
    if (c != EOF)
        *w++ = c;
    if (!isalpha(c) && c != '_')
    {
        *w = '\0';
        return c;
    }
    for ( ; --lim > 0; w++)
        if (!isalnum(*w = getch()) && *w != '_')
        {
            ungetch(*w);
            break;
        }
    *w = '\0';
    return word[0];
}

// get a (possibly pushed back) character
int getch(void)
{
    return (bufp > 0) ? buf[--bufp] : getchar();
}

// push character back on input
void ungetch(int c)
{
    if (bufp >= BUFSIZE)
        printf("ungetch: too many characters\n");
    else
        buf[bufp++] = c;
}
Personal tools