目录:
- 问题描述
- awk 程序
- c++ 程序
- c 程序
#1. 问题描述
马尔可夫链算法用于生成一段随机的英文,其思想非常简单。首先读入数据,然后将读入的数据分成前缀和后缀两部分,通过前缀来随机获取后缀,籍此产生一段可读的随机英文。
为了说明方便,假设我们有如下一段话:
Show your flowcharts and conceal your tables and I will be mystified. Show your tables and your flowcharts will be obvious.
假设前缀的长度为2,则我们处理输入以后得到如下数据,我们首先获取一个前缀,然后在前缀的后缀列表中随机选择一个单词,然后改变前缀,重复上述过程,这样,我们产生的句子将是可读的。
下面是处理过的数据:
前缀 | 后缀 |
show your | flowcharts tables |
your flowcharts | and will |
flowcharts and | conceal |
flowcharts will | be |
your tables | and and |
will be | mystified. obvious. |
be mystified. | show |
be obvious. | (end) |
处理这个文本的马尔可夫链算法将首先带引show your,然后随机取出flowcharts 或者table 两个单词,假设选择的是flowcharts, 则新的前缀就是your flowcharts,同理,选择table 时,新的前缀就是your table,有了新的前缀your flowcharts 以后,再次随即选择它的后缀,这次是在and 和 will 中随机选择,重复上述过程,就能够产生一段可读的文本。
具体描述如下:
设置 w1 和 w2 为文本的前两个词
输出 w1 和 w2
循环:
随机地选出 w3,它是文本中 w1 w2 的后缀中的一个
打印 w3
把 w1 和 w2 分别换成 w2 和 w3
重复循环
#2.awk 程序
马尔可夫链算法并不难,我们会在后面看到,用C语言来解决这个问题会相当麻烦,而用awk则只需要5分钟就能搞定。这简直就是一个演示awk优点的问题。
awk 中有关联数组,正好可以用来表示前缀和后缀的关系。程序如下:
# markov.awk: markov chain algorithm for 2-word prefixes
BEGIN { MAXGEN = 10000; NONWORD = "\n"; w1 = w2 = NONWORD }
{ for (i = 1; i <= NF; i++) { # read all words
statetab[w1,w2,++nsuffix[w1,w2]] = $i
w1 = w2
w2 = $i
}
}
END {
statetab[w1,w2,++nsuffix[w1,w2]] = NONWORD # add tail
w1 = w2 = NONWORD
for (i = 0; i < MAXGEN; i++) { # generate
r = int(rand()*nsuffix[w1,w2]) + 1 # nsuffix >= 1
p = statetab[w1,w2,r]
if (p == NONWORD)
exit
print p
w1 = w2 # advance chain
w2 = p
}
}
#3. C++ 程序
该问题的主要难点就在于通过前缀随机的获取后缀,在C++ 中,我们可以借助map 来实现前缀和后缀的对应关系,以此得到较高的开发效率。
/* Copyright (C) 1999 Lucent Technologies */
/* Excerpted from 'The Practice of Programming' */
/* by Brian W. Kernighan and Rob Pike */
#include <time.h>
#include <iostream>
#include <string>
#include <deque>
#include <map>
#include <vector>
using namespace std;
const int NPREF = 2;
const char NONWORD[] = "\n"; // cannot appear as real line: we remove newlines
const int MAXGEN = 10000; // maximum words generated
typedef deque<string> Prefix;
map<Prefix, vector<string> > statetab; // prefix -> suffixes
void build(Prefix&, istream&);
void generate(int nwords);
void add(Prefix&, const string&);
// markov main: markov-chain random text generation
int main(void)
{
int nwords = MAXGEN;
Prefix prefix; // current input prefix
srand(time(NULL));
for (int i = 0; i < NPREF; i++)
add(prefix, NONWORD);
build(prefix, cin);
add(prefix, NONWORD);
generate(nwords);
return 0;
}
// build: read input words, build state table
void build(Prefix& prefix, istream& in)
{
string buf;
while (in >> buf)
add(prefix, buf);
}
// add: add word to suffix deque, update prefix
void add(Prefix& prefix, const string& s)
{
if (prefix.size() == NPREF) {
statetab[prefix].push_back(s);
prefix.pop_front();
}
prefix.push_back(s);
}
// generate: produce output, one word per line
void generate(int nwords)
{
Prefix prefix;
int i;
for (i = 0; i < NPREF; i++)
add(prefix, NONWORD);
for (i = 0; i < nwords; i++) {
vector<string>& suf = statetab[prefix];
const string& w = suf[rand() % suf.size()];
if (w == NONWORD)
break;
cout << w << "\n";
prefix.pop_front(); // advance
prefix.push_back(w);
}
}
#4. c 程序
如果需要程序运行得足够快,那就只能用较低级的语言来实现了。当我们用c 语言来实现时,就不得不考虑各种各样的问题了。首先,面临的第一个问题就是,如何表示前缀和后缀的关系?
这里采用前缀的key,后缀为value 的方式存储前缀与后缀的关系,我们知道,hash表的查找速度最快,所以,这里采用hash表也是情理之中的事,只是看你能不能想到,用前缀作key,基于上面的思路,再仔细一点,就没有什么大问题了。
/* Copyright (C) 1999 Lucent Technologies */
/* Excerpted from 'The Practice of Programming' */
/* by Brian W. Kernighan and Rob Pike */
/*
* Markov chain random text generator.
*/
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include "eprintf.h"
enum {
NPREF = 2, /* number of prefix words */
NHASH = 4093, /* size of state hash table array */
MAXGEN = 10000 /* maximum words generated */
};
typedef struct State State;
typedef struct Suffix Suffix;
struct State { /* prefix + suffix list */
char *pref[NPREF]; /* prefix words */
Suffix *suf; /* list of suffixes */
State *next; /* next in hash table */
};
struct Suffix { /* list of suffixes */
char *word; /* suffix */
Suffix *next; /* next in list of suffixes */
};
State *lookup(char *prefix[], int create);
void build(char *prefix[], FILE*);
void generate(int nwords);
void add(char *prefix[], char *word);
State *statetab[NHASH]; /* hash table of states */
char NONWORD[] = "\n"; /* cannot appear as real word */
/* markov main: markov-chain random text generation */
int main(void)
{
int i, nwords = MAXGEN;
char *prefix[NPREF]; /* current input prefix */
int c;
long seed;
setprogname("markov");
seed = time(NULL);
srand(seed);
for (i = 0; i < NPREF; i++) /* set up initial prefix */
prefix[i] = NONWORD;
build(prefix, stdin);
add(prefix, NONWORD);
generate(nwords);
return 0;
}
const int MULTIPLIER = 31; /* for hash() */
/* hash: compute hash value for array of NPREF strings */
unsigned int hash(char *s[NPREF])
{
unsigned int h;
unsigned char *p;
int i;
h = 0;
for (i = 0; i < NPREF; i++)
for (p = (unsigned char *) s[i]; *p != '\0'; p++)
h = MULTIPLIER * h + *p;
return h % NHASH;
}
/* lookup: search for prefix; create if requested. */
/* returns pointer if present or created; NULL if not. */
/* creation doesn't strdup so strings mustn't change later. */
State* lookup(char *prefix[NPREF], int create)
{
int i, h;
State *sp;
h = hash(prefix);
for (sp = statetab[h]; sp != NULL; sp = sp->next) {
for (i = 0; i < NPREF; i++)
if (strcmp(prefix[i], sp->pref[i]) != 0)
break;
if (i == NPREF) /* found it */
return sp;
}
if (create) {
sp = (State *) emalloc(sizeof(State));
for (i = 0; i < NPREF; i++)
sp->pref[i] = prefix[i];
sp->suf = NULL;
sp->next = statetab[h];
statetab[h] = sp;
}
return sp;
}
/* addsuffix: add to state. suffix must not change later */
void addsuffix(State *sp, char *suffix)
{
Suffix *suf;
suf = (Suffix *) emalloc(sizeof(Suffix));
suf->word = suffix;
suf->next = sp->suf;
sp->suf = suf;
}
/* add: add word to suffix list, update prefix */
void add(char *prefix[NPREF], char *suffix)
{
State *sp;
sp = lookup(prefix, 1); /* create if not found */
addsuffix(sp, suffix);
/* move the words down the prefix */
memmove(prefix, prefix+1, (NPREF-1)*sizeof(prefix[0]));
prefix[NPREF-1] = suffix;
}
/* build: read input, build prefix table */
void build(char *prefix[NPREF], FILE *f)
{
char buf[100], fmt[10];
/* create a format string; %s could overflow buf */
sprintf(fmt, "%%%ds", sizeof(buf)-1);
while (fscanf(f, fmt, buf) != EOF)
add(prefix, estrdup(buf));
}
/* generate: produce output, one word per line */
void generate(int nwords)
{
State *sp;
Suffix *suf;
char *prefix[NPREF], *w;
int i, nmatch;
for (i = 0; i < NPREF; i++) /* reset initial prefix */
prefix[i] = NONWORD;
for (i = 0; i < nwords; i++) {
sp = lookup(prefix, 0);
if (sp == NULL)
eprintf("internal error: lookup failed");
nmatch = 0;
for (suf = sp->suf; suf != NULL; suf = suf->next)
if (rand() % ++nmatch == 0) /* prob = 1/nmatch */
w = suf->word;
if (nmatch == 0)
eprintf("internal error: no suffix %d %s", i, prefix[0]);
if (strcmp(w, NONWORD) == 0)
break;
printf("%s\n", w);
memmove(prefix, prefix+1, (NPREF-1)*sizeof(prefix[0]));
prefix[NPREF-1] = w;
}
}