# 几种统计单词出现次数的方法

##问题定义

##使用二叉查找树实现

``````#include <stdio.h>
#include <string.h>
#include <stdlib.h>

struct tnode  /*  the tree node  */
{
char *word;  /* points to the text*/
int count;   /* number of occurrences */
struct tnode* left; /*  left child */
struct tnode* right; /*  right child */
};

#define MAXWORD 100
struct tnode *addtree(struct tnode *, char *);
void treeprint(struct tnode *);
/*  省略了,可以自己去《C 程序设计语言》上找 */
int getword(char *, int);

int main(int argc, char const* argv[])
{
struct tnode* root;
char word[MAXWORD];

FILE *fp = NULL;
root = NULL;

fp = fopen("source.txt","r");
if ( fp == NULL )
{
printf("文件打开错误");
return 1;
}

while( fscanf(fp, "%s", word) != EOF )
{
root = addtree(root, word);
}

treeprint(root);
return 0;
}

struct tnode *talloc(void);
char *strdup(char *);

struct tnode* addtree(struct tnode *p, char *w)
{
int cond;
if (p == NULL)
{
p = talloc(); /* a new word has arrived */
p->word = strdup(w); /*  make a new node */
p->count = 1;
p->left = p->right = NULL;
}
else if ( (cond = strcmp(w, p->word)) == 0 )
{
p->count++;/*  repeated word */
}
else if (cond < 0)
{
p->left = addtree(p->left, w);
/*  less than into left subtree */
}
else
{
p->right = addtree(p->right, w);
/*  greater than into right subtree */
}
return p;
}

/*  in-order print of tree p */
void treeprint(struct tnode *p)
{
if (p != NULL)
{
treeprint(p->left);
printf("%4d %s\n", p->count, p->word);
treeprint(p->right);
}
}

struct tnode* talloc()
{
return (struct tnode *)malloc(sizeof(struct tnode));
}

char *strdup(char *s)
{
char *p;

p = (char *)malloc(strlen(s) + 1); /*  +1 for '\0' */
if (p != NULL)
{
strcpy(p, s);
}
return p;
}
``````

##使用hash 表实现

``````#include <stdio.h>
#include <string.h>
#include <stdlib.h>

typedef struct node *nodeptr;
typedef struct node
{
char *word;
int count;
nodeptr next;
}node;

#define MAXWORD 100
#define NHASH 29989 /*  the size of hash table */
#define MULT 31
nodeptr bin[NHASH];

/*  calculate the hash value */
unsigned int hash(char *p)
{
unsigned int h = 0;
for (  ; *p	; p++)
{
h = MULT * h + *p;
}
return h % NHASH;
}

void incword(char *s)
{
unsigned int h = hash(s);
nodeptr p;
for (p = bin[h]; p != NULL; p = p->next	)
{
if (strcmp(s, p->word) == 0)
/*  has repeated word */
{
p->count++;
return;
}
}

p = (nodeptr)malloc(sizeof(struct node));
/*  new a node */
p->count = 1;
p->word = (char*)malloc(strlen(s) + 1);
strcpy(p->word, s);
p->next = bin[h];
bin[h] = p;
}

int main(int argc, char const* argv[])
{
int i;
/*  initialized hash table */
for (i = 0; i < NHASH; i++)
{
bin[i] = NULL;
}

char buf[MAXWORD];
FILE *fp = NULL;

fp = fopen("source.txt","r");
if ( fp == NULL )
{
printf("文件打开错误\n");
return 1;
}

while (fscanf(fp, "%s", buf) != EOF)
{/*  incword words */
incword(buf);
}

/*  print words and it's count */
for (i = 0; i < NHASH; i++)
{
nodeptr p;
for ( p = bin[i]; p != NULL; p = p->next)
{
printf("%4d %s\n", p->count, p->word);
}
}
}
``````

##C+＋标准库

``````#include <iostream>
#include <iterator>
#include <map>
#include <string>
#include <fstream>
using namespace std;

int main(int argc, char* argv[])
{
map<string, int> M;
map<string, int>::iterator j;
string t;
ifstream in("source.txt");
while (in >> t)
{
M[t]++;
}

for (j = M.begin(); j != M.end(); ++j)
{
cout << j->second << " " << j->first << endl;
}
return 0;
}
``````

``````#include <algorithm>
#include <iostream>
#include <iterator>
#include <fstream>
#include <string>
#include <map>

using namespace std;

map<string , int> M;
void print( const pair<string, int> &r)
{
cout << r.second << " " << r.first << endl;
}

void record( const string &s)
{
M[ s ] ++;
}

int main()
{
fstream in("source.txt");
istream_iterator<string> it(in);
istream_iterator<string> eos;
for_each(it,eos,record);
for_each( M.begin(), M.end(), print);

return 0;
}
``````

##shell 脚本

``````cat source.txt | tr -cs a-zA-Z '\n' | sort | uniq -c
``````
/
Published under (CC) BY-NC-SA in categories 算法  tagged with algorithm  STL  hash  shell