The C Programming Language, 2nd Edition, by Kernighan and Ritchie
Exercise 6.03 on page 143
Write a cross-referencer that prints a list of all words in a document, and, for each word, a list of the line numbers on which it occurs. Remove noise words like "the", "and," and so on.
Solution by Richard Heathfield
Bug (noticed by John W Krahn) fixed 11 June 2002. The noise word list
was broken because it contained out-of-order data. I fixed this, and
made the program more generally useful, by performing all string
comparisons without regard to case.
/* Write a cross-referencer program that prints a list of all words in a * document, and, for each word, a list of the line numbers on which it * occurs. Remove noise words like "the", "and," and so on. */ #include <stdio.h> #include <stdlib.h> #include <string.h> #include <ctype.h> /* no such thing as strdup, so let's write one * * supplementary question: why did I call this function dupstr, * rather than strdup? * */ char *dupstr(char *s) { char *p = NULL; if(s != NULL) { p = malloc(strlen(s) + 1); if(p) { strcpy(p, s); } } return p; } /* case-insensitive string comparison */ int i_strcmp(const char *s, const char *t) { int diff = 0; char cs = 0; char ct = 0; while(diff == 0 && *s != '\0' && *t != '\0') { cs = tolower((unsigned char)*s); ct = tolower((unsigned char)*t); if(cs < ct) { diff = -1; } else if(cs > ct) { diff = 1; } ++s; ++t; } if(diff == 0 && *s != *t) { /* the shorter string comes lexicographically sooner */ if(*s == '\0') { diff = -1; } else { diff = 1; } } return diff; } struct linelist { struct linelist *next; int line; }; struct wordtree { char *word; struct linelist *firstline; struct wordtree *left; struct wordtree *right; }; void printlist(struct linelist *list) { if(list != NULL) { printlist(list->next); printf("%6d ", list->line); } } void printtree(struct wordtree *node) { if(node != NULL) { printtree(node->left); printf("%18s ", node->word); printlist(node->firstline); printf("\n"); printtree(node->right); } } struct linelist *addlink(int line) { struct linelist *new = malloc(sizeof *new); if(new != NULL) { new->line = line; new->next = NULL; } return new; } void deletelist(struct linelist *listnode) { if(listnode != NULL) { deletelist(listnode->next); free(listnode); } } void deleteword(struct wordtree **node) { struct wordtree *temp = NULL; if(node != NULL) { if(*node != '\0') { if((*node)->right != NULL) { temp = *node; deleteword(&temp->right); } if((*node)->left != NULL) { temp = *node; deleteword(&temp->left); } if((*node)->word != NULL) { free((*node)->word); } if((*node)->firstline != NULL) { deletelist((*node)->firstline); } free(*node); *node = NULL; } } } struct wordtree *addword(struct wordtree **node, char *word, int line) { struct wordtree *wordloc = NULL; struct linelist *newline = NULL; struct wordtree *temp = NULL; int diff = 0; if(node != NULL && word != NULL) { if(NULL == *node) { *node = malloc(sizeof **node); if(NULL != *node) { (*node)->left = NULL; (*node)->right = NULL; (*node)->word = dupstr(word); if((*node)->word != NULL) { (*node)->firstline = addlink(line); if((*node)->firstline != NULL) { wordloc = *node; } } } } else { diff = i_strcmp((*node)->word, word); if(0 == diff) { /* we have seen this word before! add this line number to * the front of the line number list. Adding to the end * would keep them in the right order, but would take * longer. By continually adding them to the front, we * take less time, but we pay for it at the end by having * to go to the end of the list and working backwards. * Recursion makes this less painful than it might have been. */ newline = addlink(line); if(newline != NULL) { wordloc = *node; newline->next = (*node)->firstline; (*node)->firstline = newline; } } else if(0 < diff) { temp = *node; wordloc = addword(&temp->left, word, line); } else { temp = *node; wordloc = addword(&temp->right, word, line); } } } if(wordloc == NULL) { deleteword(node); } return wordloc; } /* We can't use strchr because it's not yet been discussed, so we'll * write our own instead. */ char *char_in_string(char *s, int c) { char *p = NULL; /* if there's no data, we'll stop */ if(s != NULL) { if(c != '\0') { while(*s != '\0' && *s != c) { ++s; } if(*s == c) { p = s; } } } return p; } /* We can't use strtok because it hasn't been discussed in the text * yet, so we'll write our own. * To minimise hassle at the user end, let's modify the user's pointer * to s, so that we can just call this thing in a simple loop. */ char *tokenise(char **s, char *delims) { char *p = NULL; char *q = NULL; if(s != NULL && *s != '\0' && delims != NULL) { /* pass over leading delimiters */ while(NULL != char_in_string(delims, **s)) { ++*s; } if(**s != '\0') { q = *s + 1; p = *s; while(*q != '\0' && NULL == char_in_string(delims, *q)) { ++q; } *s = q + (*q != '\0'); *q = '\0'; } } return p; } /* return zero if this word is not a noise word, * or non-zero if it is a noise word */ int NoiseWord(char *s) { int found = 0; int giveup = 0; char *list[] = { "a", "an", "and", "be", "but", "by", "he", "I", "is", "it", "off", "on", "she", "so", "the", "they", "you" }; int top = sizeof list / sizeof list[0] - 1; int bottom = 0; int guess = top / 2; int diff = 0; if(s != NULL) { while(!found && !giveup) { diff = i_strcmp(list[guess], s); if(0 == diff) { found = 1; } else if(0 < diff) { top = guess - 1; } else { bottom = guess + 1; } if(top < bottom) { giveup = 1; } else { guess = (top + bottom) / 2; } } } return found; } /* * Argh! We can't use fgets()! It's not discussed until page 164. * Oh well... time to roll our own again... */ char *GetLine(char *s, int n, FILE *fp) { int c = 0; int done = 0; char *p = s; while(!done && --n > 0 && (c = getc(fp)) != EOF) { if((*p++ = c) == '\n') { done = 1; } } *p = '\0'; if(EOF == c && p == s) { p = NULL; } else { p = s; } return p; } /* * Ideally, we'd use a clever GetLine function which expanded its * buffer dynamically to cope with large lines. Since we can't use * realloc, and because other solutions would require quite hefty * engineering, we'll adopt a simple solution - a big buffer. * * Note: making the buffer static will help matters on some * primitive systems which don't reserve much storage for * automatic variables, and shouldn't break anything anywhere. * */ #define MAXLINE 8192 int main(void) { static char buffer[MAXLINE] = {0}; char *s = NULL; char *word = NULL; int line = 0; int giveup = 0; struct wordtree *tree = NULL; char *delims = " \t\n\r\a\f\v!\"%^&*()_=+{}[]\\|/,.<>:;#~?"; while(!giveup && GetLine(buffer, sizeof buffer, stdin) != NULL) { ++line; s = buffer; while(!giveup && (word = tokenise(&s, delims)) != NULL) { if(!NoiseWord(word)) { if(NULL == addword(&tree, word, line)) { printf("Error adding data into memory. Giving up.\n"); giveup = 1; } } } } if(!giveup) { printf("%18s Line Numbers\n", "Word"); printtree(tree); } deleteword(&tree); return 0; }
Solution by Cromagnon
This is a category 2 solution by Cromagnon (talk) December 26, 2019.
This is a category 2 solution because it considers that the function strdup()
is declared in string.h
. It also removes the function talloc()
.
My version of getword()
returns a pointer to the word, instead of an integer.
If you use this solution, do not forget to link it with the getch and ungetch functions.
It is case insensitive, "The" is like "the".
#include <stdio.h> #include <ctype.h> #include <string.h> #include <stdlib.h> #define MAXLINES 10 /* max # of lines to count for each word*/ struct tnode { char *word; /* pointer to the text */ struct tnode *left; /* left child */ struct tnode *right; /* right child */ unsigned lines[MAXLINES]; /* line numbes */ }; struct tnode *addtree(struct tnode *, char *, int); void treeprint(struct tnode *); char *getword(); int isnoise(char *); /* wf: word frequency count */ int main(void) { struct tnode *root; char *word; unsigned int line = 1; root = NULL; while ((word = getword()) != NULL) if (isalpha(word[0]) && !isnoise(word)) root = addtree(root, word, line); else if (word[0] == '\n') line++; treeprint(root); return 0; } /* isnoise: test if word is noise word */ int isnoise(char * word) { static char *nw[] = { "a", "an", "and", "are", "for", "from", "in", "is", "it", "not", "of", "on", "or", "that", "the", "this", "to", "was", "with", }; int cond; int low, high, mid; if (word[1] == '\0') return 1; low = 0; high = sizeof(nw) / sizeof(char *) - 1; while (low <= high) { mid = (low + high) / 2; if ((cond = strcmp(word, nw[mid])) < 0) high = mid - 1; else if (cond > 0) low = mid + 1; else return 1; } return 0; } /* addtree: add a node with w, at or below p */ struct tnode *addtree(struct tnode *p, char *w, int l) { int cond; int i; /* if a new word has arrived, make a new node */ if (p == NULL) { p = (struct tnode *) malloc(sizeof(struct tnode)); p->word = strdup(w); p->lines[0] = l; for (i = 1; i < MAXLINES; i++) p->lines[i] = 0; p->left = p->right = NULL; } /* repeated word */ else if ((cond = strcmp(w, p->word)) == 0) { for (i = 0; p->lines[i] && i < MAXLINES; i++); ; p->lines[i] = l; } /* less than into left subtree */ else if (cond < 0) p->left = addtree(p->left, w, l); /* greater than into right subtree */ else if (cond > 0) p->right = addtree(p->right, w, l); return p; } /* treeprint: in-order print of tree p */ void treeprint(struct tnode *p) { int i; if (p != NULL) { treeprint(p->left); printf("%-16s", p->word); for (i = 0; i < MAXLINES && p->lines[i]; i++) printf("%s%d", i==0 ? "" : ", ", p->lines[i]); putchar('\n'); treeprint(p->right); } } /* getword: get next word or character from input */ char *getword() { static char word[100]; int c, lim = 100; char *w = word; while (isspace(c = getch()) && c != '\n') ; if (c != EOF) *w++ = tolower(c); if (isalpha(c)) { for ( ; --lim > 1; w++) if (!isalnum(*w = tolower(getch())) && *w != '_') { ungetch(*w); break; } } *w = '\0'; if (c == EOF) return NULL; else return word; }
Solution by anonymous
I found a list of stop words which I used to filter out the noise. I added the option to allow for case-insensitive words or case sensitive depending on the command line arguments. I used a comma separated string to store the list of line numbers that a word appears on and added a helper function to handle that messy process.
Here is my code
#include <stdio.h> #include <ctype.h> #include <string.h> #include <stdlib.h> /* Exercise 6-3. Write a cross-referencer that prints a list of all words in a document, and, for each word, a list of the line numbers on which it occurs. Remove noise words like "the," "and," and so on. */ struct tnode // the tree node: { char *word; // points to the text char *lines; // str that contains line(s) word appears on struct tnode *left; // left child struct tnode *right; // right child }; #define MAXWORD 100 #define BUFSIZE 100 #define MAXLINESTRSIZE 10000 #define MAXDIGITSNUM 50 // used to store sprintf output, Note: unsigned long int max == 20 digits #define STOPWORDLEN 198 // number of stop words struct tnode *addtree(struct tnode *p, char *w); struct tnode *talloc(void); void treeprint(struct tnode *p); void ungetch(int c); int getword(char *, int); int binsearch(char *word); int getword(char *word, int lim); int getch(void); int addLineNumToStr(char *dest); char *mystrdup(char *); enum boolean { FALSE, TRUE }; char *stopWords[STOPWORDLEN] = { // stop words are typically "filler" words in a language. List based on nltk's data set (https://www.nltk.org). Must be in alphabetical order "_", "a", "about", "above", "after", "again", "against", "ain", "all", "am", "an", "and", "any", "are", "aren", "aren't", "as", "at", "b", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "c", "can", "couldn", "couldn't", "d", "did", "didn", "didn't", "do", "does", "doesn", "doesn't", "doing", "don", "don't", "down", "during", "e", "each", "f", "few", "for", "from", "further", "g", "h", "had", "hadn", "hadn't", "has", "hasn", "hasn't", "have", "haven", "haven't", "having", "he", "her", "here", "hers", "herself", "him", "himself", "his", "how", "i", "if", "in", "into", "is", "isn", "isn't", "it", "its", "it's", "itself", "j", "just", "k", "l", "ll", "m", "ma", "me", "mightn", "mightn't", "more", "most", "mustn", "mustn't", "my", "myself", "n", "needn", "needn't", "no", "nor", "not", "now", "o", "of", "off", "on", "once", "only", "or", "other", "our", "ours", "ourselves", "out", "over", "own", "p", "q", "r", "re", "s", "same", "shan", "shan't", "she", "she's", "should", "shouldn", "shouldn't", "should've", "so", "some", "such", "t", "than", "that", "that'll", "the", "their", "theirs", "them", "themselves", "then", "there", "these", "they", "this", "those", "through", "to", "too", "u", "under", "until", "up", "v", "ve", "very", "w", "was", "wasn", "wasn't", "we", "were", "weren", "weren't", "what", "when", "where", "which", "while", "who", "whom", "why", "will", "with", "won", "won't", "wouldn", "wouldn't", "x", "y", "you", "you'd", "you'll", "your", "you're", "yours", "yourself", "yourselves", "you've", "z" }; char buf[BUFSIZE]; // buffer for ungetch int bufp = 0; // next free position in buf int lineNum = 1; int caseInsensitive = FALSE; // word frequency count int main(int argc, char *argv[]) { if (argc <= 2) // if 1 arg (no args given), value will equal 0 after loop. If 2 args, process 2nd arg and value will equal 0 after loop { while (--argc > 0) if (strcmp(*++argv, "-i") == 0) caseInsensitive = TRUE; else argc = -1; // prints usage } if (argc != 0) { printf("usage: getWords [-i]"); return 1; } struct tnode *root; char word[MAXWORD]; root = NULL; while (getword(word, MAXWORD) != EOF) if ((isalpha(word[0]) || word[0] == '_') && binsearch(word) == -1) // if first char is alpha and not in the stop word list { root = addtree(root, word); if (root == NULL) return 1; } treeprint(root); return 0; } // add a node with w, at or below p struct tnode *addtree(struct tnode *p, char *w) { int result; if (p == NULL) // a new word has arrived { p = talloc(); // make a new node p->word = mystrdup(w); char *str = (char *) malloc(MAXLINESTRSIZE); // get chunk of memory for string str[0] = '\0'; // ensure it is a null string before addLineNumToStr is called if (p != NULL) { if (addLineNumToStr(str) == -1) // if -1, error occurred return NULL; p->lines = str; } else { printf("Error: could not allocate memory for tree node\n"); return p; // p == NULL } p->left = p->right = NULL; } else if ((result = strcmp(w, p->word)) == 0) // same word { if (addLineNumToStr(p->lines) == -1) // if -1, error occurred return NULL; } else if (result < 0) // less than into left subtree { p->left = addtree(p->left, w); if (p->left == NULL) return NULL; } else // greater than into right subtree { p->right = addtree(p->right, w); if (p->right == NULL) return NULL; } return p; } // in-order print of tree p void treeprint(struct tnode *p) { if (p != NULL) { treeprint(p->left); printf("%s ", p->word); printf("%s\n", p->lines); treeprint(p->right); } } struct tnode *talloc(void) { return (struct tnode *) malloc(sizeof(struct tnode)); } char *mystrdup(char *s) // make a duplicate of s { char *p = (char *) malloc(strlen(s) + 1); // + 1 for '\0' if (p != NULL) strcpy(p, s); return p; } // get next word or character from input int getword(char *word, int lim) { int c; char *w = word; while ((c = getch()) == '\t' || c == ' ') ; if (caseInsensitive) c = tolower(c); if (c != EOF) *w++ = c; if (!isalpha(c) && c != '_') { if (word[0] == '\n') lineNum++; *w = '\0'; return c; } for ( ; --lim > 0; w++) { *w = getch(); if (caseInsensitive) *w = tolower(*w); if (!isalnum(*w) && *w != '_') { ungetch(*w); break; } } *w = '\0'; return word[0]; } // get a (possibly pushed back) character int getch(void) { return (bufp > 0) ? buf[--bufp] : getchar(); } // push character back on input void ungetch(int c) { if (bufp >= BUFSIZE) printf("ungetch: too many characters\n"); else buf[bufp++] = c; } // find word in stopWords, stopWords must be in alphabetical order int binsearch(char *word) { int result, mid, low = 0, high = STOPWORDLEN - 1; while (low <= high) { mid = (low + high) / 2; result = stricmp(word, stopWords[mid]); // case-insensitive comparison to see if matches stop word if (result < 0) high = mid - 1; else if (result > 0) low = mid + 1; else return mid; } return -1; // not found } int addLineNumToStr(char *dest) { char numStr[MAXDIGITSNUM]; int digits = 0, n = lineNum; while (n != 0) { n /= 10; digits++; } int len = strlen(dest); if (len == 0) { sprintf(numStr, "%d", lineNum); // sprintf is the a part of the C standard library, but itoa is not (although widely implemented). Using sprintf from now on if (digits + 1 <= MAXLINESTRSIZE) // digits + 1 because of '\0' strcat(dest, numStr); else { printf("Error: not enough memory to add new line string to tree node\n"); return -1; } } else { sprintf(numStr, ", %d", lineNum); int same = TRUE; for (int i = 0; i < digits; i++) if (numStr[i] != dest[len - digits + i]) { same = FALSE; break; } if (!same) { if (len + digits + 3 <= MAXLINESTRSIZE) // digits + 3 because 1 for '\0', 1 for ',' and 1 for ' ' strcat(dest, numStr); else { printf("Error: not enough memory to add new line string to tree node\n"); return -1; } } } return 0; }