全文檢索是一種將文件中所有文本與檢索項匹配的文字資料檢索方法。全文檢索系統是按照全文檢索理論建立起來的用於提供全文檢索服務的軟件系統。
[cpp]
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
//Status是函數的類型,其值是函數結果狀態代碼
typedef int Status;
#define INT_MAX 32767
#define LINESIZE 80//設一行字符數為80
#define MAXKEYLEN 26//關鍵字的長度
#define MAXNUM 100//統計單詞的最大數
typedef struct{
char *ch; //關鍵字
int num; //關鍵字的長度
char *info;//關鍵字有關信息
}KeysType; //關鍵字類型
typedef enum{LEAF,BRANCH} NodeKind;//結點種類:{葉子,分支}
typedef struct DLTNode{
char symbol;
struct DLTNode *next; //指向兄弟結點的指針
NodeKind kind; //結點標志
union{
struct DLTNode *first;//分支結點的孩子鏈指針
struct {
int idx; //葉子結點的count數組下標指針
KeysType infoptr;
};
};
}DLTNode,*DLTree; //雙鏈樹類型
char line[LINESIZE];//用於緩存文章中每行的字符串
struct{
int times;
KeysType info;
}count[MAXNUM];
int NUM=0;//記錄整個文章的單詞總數
char ch1[17][100]={"a","an","the","them","there","here","they","are","that","this","those","what","which","why","then","and","these"};
void InitTree(DLTree &root){
//初始化鍵樹
root=new DLTNode;
root->next =NULL;
root->first=NULL;
}//InitTree
Status Insert_DLTree(DLTree &root,KeysType K,int &n){
//指針root所指雙鏈樹中已含n個關鍵字,若不存在和K相同的關鍵字,
//則將關鍵字K插入到雙鏈樹中相應位置,樹中關鍵字個數n增1且返回TRUE;..
//否則不再插入返回FALSE
DLTree p=root->first,f=root,pre,s;
int j=0;
while (p && j<K.num ){//在鍵樹中進行查找
pre=NULL;
while (p && p->symbol <K.ch [j]){//查找和K.ch[j]相同的結點
pre=p;
p=p->next ;
}
if (p && p->symbol ==K.ch[j]){
f=p;
p=p->first ;
j++;
}//找到後進入鍵樹的下一層,即查找和K.ch[j+1]相同的結點
else{//沒有找到和K.ch[j]相同的結點,插入K.ch[j]
s=new DLTNode;
s->kind =BRANCH;
s->symbol =K.ch [j++];
if (pre) pre->next =s;
else f->first =s;
s->next =p;
s->first =NULL;
p=s;
break;
}//else
}//while
if (p&&j==K.num && p->first&&p->first ->kind ==LEAF) return FALSE;
else{//鍵樹中已存在相同前綴的單詞,插入由剩余字符構成的單支樹
while (j<=K.num ){
s=new DLTNode;
s->next =NULL;
if (p){
s->next =p->next ;
p->first =s;
p=s;
}
else {
f->first =s;
p=s;
}
if(j<K.num ){
s->kind =BRANCH;
s->symbol =K.ch [j++];
s->first =NULL;
}
else{
s->symbol ='$';
s->kind =LEAF;
n++;
s->idx =n;
s->infoptr.ch =count[s->idx ].info.ch =K.ch ;
s->infoptr.info =count[s->idx ].info.info =K.info ;
s->infoptr.num =count[s->idx ].info.num=K.num ;
count[s->idx ].times=0;
j++;
}
}//while
return TRUE;
}//else
}//Insert_DLTree
void CreatDLTree(DLTree &T,KeysType *key,int &n){
//建立鍵樹模型
for (int i=0;i<=16;i++){
key[i].ch=ch1[i];
key[i].info=NULL;
key[i].num =strlen(key[i].ch );
}
key[0].info ="indefinite article.";//a
key[1].info ="indefinite articles.";//an
key[2].info ="definite article.";//the
key[3].info ="personal pronoun";//them
key[4].info ="adv. of place and direction";//there
key[5].info ="to this point or place";//here
key[6].info ="pronoun";//they
key[7].info ="v.i. joining subject & predicate";//are
key[8].info ="adj. & pron.";//that
key[9].info ="adj. & pron.";//this
key[10].info ="adj. & pron.";//those
key[11].info ="adj. & pron.,asking for a selection from an indefinite number";//what
key[12].info ="adj. & pron.,asking for a selection from two or a group";//which
key[13].info ="adj. & int.,for what reason,with what purpose";//why
key[14].info ="adv.,at what time";//then
key[15].info ="conj.,connecting words";//and
key[16].info ="adj. & pron.";//these
for(i=0;i<=16;i++)
Insert_DLTree(T,key[i],n);//建立鍵樹模型
}//CreatDLTree
Status Search_DLTree(DLTree rt,int j,int &k){
//若line中從第j個字符起長度為k的子串和指針rt所指向雙鏈樹中單詞相同,
//則數組count中相應分量增1,並返回TRUE,否則返回FALSE
DLTree p;
int found;
k=0;
found=FALSE;
p=rt->first ;//p指向雙鏈樹中的第一棵子樹的樹根
while (p && !found){
while (p && p->symbol <line[j+k]) p=p->next ;
if (!p||p->symbol >line[j+k]) break;//在鍵樹的第k+1層上匹配失敗
else{//繼續匹配
p=p->first ;
k++;
if (p->kind ==LEAF){//找到一個單詞
if (!(line[j+k]>='a'&&line[j+k]<='z')||(line[j+k]>='A'&&line[j+k]<='Z' )){
count[p->idx ].times++;//統計數組對應元素加1
found=TRUE;
}//若鍵樹中含有"commit",文本行中為commit則為找到,若為committing則為沒找到
}//if
}//else
}//while
return found;
}//Search_DLTree
void setmatch(DLTree root,char *line,FILE *f){
//統計以root為根指針的鍵樹中,各關鍵字在本文本串line中重復出現的次數,
//並將其累加到統計數組count中去
unsigned i=0;
int k;//若查找成功,返回的k為所查找的關鍵字長度
while (fgets(line,LINESIZE,f)!=NULL){
cout<<line;
i=0;
while (i<=strlen(line)){//LINESIZE){
if (!Search_DLTree(root,i,k)) {
if (((line[i]>='a' && line[i]<='z')||(line[i]>='A'&&line[i]<='Z')||(line[i]>='0'&&line[i]<='9')) &&!((line[i+1]>='a'&&line[i+1]<='z')||(line[i+1]>='A'&&line[i+1]<='Z')||(line[i+1]>='0'&&line[i+1]<='9')))
NUM++;//單詞總數加1
i++; //若查找不成功,則從下一個字符開始查找
}//if
else {
i+=k; //查找成功,繼續在文本串中的第i+k-1個字符開始查找
NUM++;
}//else
}//while
}//while
}//setmatch
void Input(FILE *&f){
char fname[30];
cout<<"please input the file name:";
cin>>fname; //輸入文件名
if ((f=fopen(fname,"r"))==NULL){//輸入錯誤文件名, 無法打開
cout<<"Can NOT open the file!"<<endl;
exit(1);
}
}//Input
void output(int n){
cout<<endl;
for(int i=1;i<=n;i++)
if (count[i].times){
cout<<count[i].info.ch<<" : ";
if(count[i].info.info) cout<<count[i].info.info<<endl;
cout<<float(count[i].times)/float(NUM)<<" in the sentence."<<endl;
}//輸出文章中含有鍵樹中的關鍵字及有關解釋,出現頻率。即:該關鍵字出現次數/文章單詞總數
cout<<endl;
}
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
//Status是函數的類型,其值是函數結果狀態代碼
typedef int Status;
#define INT_MAX 32767
#define LINESIZE 80//設一行字符數為80
#define MAXKEYLEN 26//關鍵字的長度
#define MAXNUM 100//統計單詞的最大數
typedef struct{
char *ch; //關鍵字
int num; //關鍵字的長度
char *info;//關鍵字有關信息
}KeysType; //關鍵字類型
typedef enum{LEAF,BRANCH} NodeKind;//結點種類:{葉子,分支}
typedef struct DLTNode{
char symbol;
struct DLTNode *next; //指向兄弟結點的指針
NodeKind kind; //結點標志
union{
struct DLTNode *first;//分支結點的孩子鏈指針
struct {
int idx; //葉子結點的count數組下標指針
KeysType infoptr;
};
};
}DLTNode,*DLTree; //雙鏈樹類型
char line[LINESIZE];//用於緩存文章中每行的字符串
struct{
int times;
KeysType info;
}count[MAXNUM];
int NUM=0;//記錄整個文章的單詞總數
char ch1[17][100]={"a","an","the","them","there","here","they","are","that","this","those","what","which","why","then","and","these"};
void InitTree(DLTree &root){
//初始化鍵樹
root=new DLTNode;
root->next =NULL;
root->first=NULL;
}//InitTree
Status Insert_DLTree(DLTree &root,KeysType K,int &n){
//指針root所指雙鏈樹中已含n個關鍵字,若不存在和K相同的關鍵字,
//則將關鍵字K插入到雙鏈樹中相應位置,樹中關鍵字個數n增1且返回TRUE;..
//否則不再插入返回FALSE
DLTree p=root->first,f=root,pre,s;
int j=0;
while (p && j<K.num ){//在鍵樹中進行查找
pre=NULL;
while (p && p->symbol <K.ch [j]){//查找和K.ch[j]相同的結點
pre=p;
p=p->next ;
}
if (p && p->symbol ==K.ch[j]){
f=p;
p=p->first ;
j++;
}//找到後進入鍵樹的下一層,即查找和K.ch[j+1]相同的結點
else{//沒有找到和K.ch[j]相同的結點,插入K.ch[j]
s=new DLTNode;
s->kind =BRANCH;
s->symbol =K.ch [j++];
if (pre) pre->next =s;
else f->first =s;
s->next =p;
s->first =NULL;
p=s;
break;
}//else
}//while
if (p&&j==K.num && p->first&&p->first ->kind ==LEAF) return FALSE;
else{//鍵樹中已存在相同前綴的單詞,插入由剩余字符構成的單支樹
while (j<=K.num ){
s=new DLTNode;
s->next =NULL;
if (p){
s->next =p->next ;
p->first =s;
p=s;
}
else {
f->first =s;
p=s;
}
if(j<K.num ){
s->kind =BRANCH;
s->symbol =K.ch [j++];
s->first =NULL;
}
else{
s->symbol ='$';
s->kind =LEAF;
n++;
s->idx =n;
s->infoptr.ch =count[s->idx ].info.ch =K.ch ;
s->infoptr.info =count[s->idx ].info.info =K.info ;
s->infoptr.num =count[s->idx ].info.num=K.num ;
count[s->idx ].times=0;
j++;
}
}//while
return TRUE;
}//else
}//Insert_DLTree
void CreatDLTree(DLTree &T,KeysType *key,int &n){
//建立鍵樹模型
for (int i=0;i<=16;i++){
key[i].ch=ch1[i];
key[i].info=NULL;
key[i].num =strlen(key[i].ch );
}
key[0].info ="indefinite article.";//a
key[1].info ="indefinite articles.";//an
key[2].info ="definite article.";//the
key[3].info ="personal pronoun";//them
key[4].info ="adv. of place and direction";//there
key[5].info ="to this point or place";//here
key[6].info ="pronoun";//they
key[7].info ="v.i. joining subject & predicate";//are
key[8].info ="adj. & pron.";//that
key[9].info ="adj. & pron.";//this
key[10].info ="adj. & pron.";//those
key[11].info ="adj. & pron.,asking for a selection from an indefinite number";//what
key[12].info ="adj. & pron.,asking for a selection from two or a group";//which
key[13].info ="adj. & int.,for what reason,with what purpose";//why
key[14].info ="adv.,at what time";//then
key[15].info ="conj.,connecting words";//and
key[16].info ="adj. & pron.";//these
for(i=0;i<=16;i++)
Insert_DLTree(T,key[i],n);//建立鍵樹模型
}//CreatDLTree
Status Search_DLTree(DLTree rt,int j,int &k){
//若line中從第j個字符起長度為k的子串和指針rt所指向雙鏈樹中單詞相同,
//則數組count中相應分量增1,並返回TRUE,否則返回FALSE
DLTree p;
int found;
k=0;
found=FALSE;
p=rt->first ;//p指向雙鏈樹中的第一棵子樹的樹根
while (p && !found){
while (p && p->symbol <line[j+k]) p=p->next ;
if (!p||p->symbol >line[j+k]) break;//在鍵樹的第k+1層上匹配失敗
else{//繼續匹配
p=p->first ;
k++;
if (p->kind ==LEAF){//找到一個單詞
if (!(line[j+k]>='a'&&line[j+k]<='z')||(line[j+k]>='A'&&line[j+k]<='Z' )){
count[p->idx ].times++;//統計數組對應元素加1
found=TRUE;
}//若鍵樹中含有"commit",文本行中為commit則為找到,若為committing則為沒找到
}//if
}//else
}//while
return found;
}//Search_DLTree
void setmatch(DLTree root,char *line,FILE *f){
//統計以root為根指針的鍵樹中,各關鍵字在本文本串line中重復出現的次數,
//並將其累加到統計數組count中去
unsigned i=0;
int k;//若查找成功,返回的k為所查找的關鍵字長度
while (fgets(line,LINESIZE,f)!=NULL){
cout<<line;
i=0;
while (i<=strlen(line)){//LINESIZE){
if (!Search_DLTree(root,i,k)) {
if (((line[i]>='a' && line[i]<='z')||(line[i]>='A'&&line[i]<='Z')||(line[i]>='0'&&line[i]<='9')) &&!((line[i+1]>='a'&&line[i+1]<='z')||(line[i+1]>='A'&&line[i+1]<='Z')||(line[i+1]>='0'&&line[i+1]<='9')))
NUM++;//單詞總數加1
i++; //若查找不成功,則從下一個字符開始查找
}//if
else {
i+=k; //查找成功,繼續在文本串中的第i+k-1個字符開始查找
NUM++;
}//else
}//while
}//while
}//setmatch
void Input(FILE *&f){
char fname[30];
cout<<"please input the file name:";
cin>>fname; //輸入文件名
if ((f=fopen(fname,"r"))==NULL){//輸入錯誤文件名, 無法打開
cout<<"Can NOT open the file!"<<endl;
exit(1);
}
}//Input
void output(int n){
cout<<endl;
for(int i=1;i<=n;i++)
if (count[i].times){
cout<<count[i].info.ch<<" : ";
if(count[i].info.info) cout<<count[i].info.info<<endl;
cout<<float(count[i].times)/float(NUM)<<" in the sentence."<<endl;
}//輸出文章中含有鍵樹中的關鍵字及有關解釋,出現頻率。即:該關鍵字出現次數/文章單詞總數
cout<<endl;
}