程序師世界是廣大編程愛好者互助、分享、學習的平台,程序師世界有你更精彩!
首頁
編程語言
C語言|JAVA編程
Python編程
網頁編程
ASP編程|PHP編程
JSP編程
數據庫知識
MYSQL數據庫|SqlServer數據庫
Oracle數據庫|DB2數據庫
 程式師世界 >> 編程語言 >> C語言 >> C++ >> C++入門知識 >> 全文檢索模型算法實現

全文檢索模型算法實現

編輯:C++入門知識

 


全文檢索是一種將文件中所有文本與檢索項匹配的文字資料檢索方法。全文檢索系統是按照全文檢索理論建立起來的用於提供全文檢索服務的軟件系統。

 

 

 

 

 


[cpp] 
#include <stdlib.h>  
#include <string.h>  
#include <stdio.h>  
 
//Status是函數的類型,其值是函數結果狀態代碼  
typedef int Status; 
#define INT_MAX 32767  
#define LINESIZE 80//設一行字符數為80  
#define MAXKEYLEN 26//關鍵字的長度  
#define MAXNUM 100//統計單詞的最大數  
typedef struct{    
    char *ch;  //關鍵字  
    int  num;  //關鍵字的長度  
    char *info;//關鍵字有關信息  
}KeysType;     //關鍵字類型  
 
typedef enum{LEAF,BRANCH} NodeKind;//結點種類:{葉子,分支}  
 
typedef struct DLTNode{ 
    char symbol; 
    struct DLTNode *next;     //指向兄弟結點的指針  
    NodeKind kind;            //結點標志  
    union{ 
        struct DLTNode *first;//分支結點的孩子鏈指針  
        struct { 
            int idx;          //葉子結點的count數組下標指針  
            KeysType infoptr; 
        }; 
    }; 
}DLTNode,*DLTree;   //雙鏈樹類型  
char line[LINESIZE];//用於緩存文章中每行的字符串  
struct{ 
    int times; 
    KeysType info; 
}count[MAXNUM]; 
int NUM=0;//記錄整個文章的單詞總數  
char ch1[17][100]={"a","an","the","them","there","here","they","are","that","this","those","what","which","why","then","and","these"}; 
 
void InitTree(DLTree &root){ 
    //初始化鍵樹  
    root=new DLTNode; 
    root->next =NULL; 
    root->first=NULL; 
}//InitTree  
 
Status Insert_DLTree(DLTree &root,KeysType K,int &n){ 
    //指針root所指雙鏈樹中已含n個關鍵字,若不存在和K相同的關鍵字,  
    //則將關鍵字K插入到雙鏈樹中相應位置,樹中關鍵字個數n增1且返回TRUE;..  
    //否則不再插入返回FALSE  
    DLTree p=root->first,f=root,pre,s; 
    int j=0; 
    while (p && j<K.num ){//在鍵樹中進行查找  
        pre=NULL; 
        while (p && p->symbol <K.ch [j]){//查找和K.ch[j]相同的結點  
            pre=p; 
            p=p->next ; 
        } 
        if (p && p->symbol ==K.ch[j]){ 
            f=p; 
            p=p->first ; 
            j++; 
        }//找到後進入鍵樹的下一層,即查找和K.ch[j+1]相同的結點  
        else{//沒有找到和K.ch[j]相同的結點,插入K.ch[j]  
            s=new DLTNode; 
            s->kind =BRANCH; 
            s->symbol =K.ch [j++]; 
            if (pre) pre->next =s; 
            else f->first =s; 
            s->next =p; 
            s->first =NULL; 
            p=s; 
            break; 
        }//else  
    }//while  
    if (p&&j==K.num && p->first&&p->first ->kind ==LEAF) return FALSE; 
    else{//鍵樹中已存在相同前綴的單詞,插入由剩余字符構成的單支樹  
        while (j<=K.num ){ 
                s=new DLTNode; 
                s->next =NULL; 
                if (p){ 
                s->next =p->next ; 
                p->first =s; 
                p=s; 
                } 
                else { 
                    f->first =s; 
                    p=s; 
                } 
                if(j<K.num ){ 
                    s->kind =BRANCH; 
                    s->symbol =K.ch [j++]; 
                    s->first =NULL; 
                } 
                else{ 
                    s->symbol ='$'; 
                    s->kind =LEAF; 
                    n++; 
                    s->idx =n; 
                    s->infoptr.ch =count[s->idx ].info.ch =K.ch ; 
                    s->infoptr.info =count[s->idx ].info.info =K.info ; 
                    s->infoptr.num =count[s->idx ].info.num=K.num ; 
                    count[s->idx ].times=0; 
                    j++; 
                } 
            }//while  
        return TRUE; 
        }//else  
}//Insert_DLTree  
 
 
void CreatDLTree(DLTree &T,KeysType *key,int &n){ 
    //建立鍵樹模型  
    for (int i=0;i<=16;i++){ 
        key[i].ch=ch1[i]; 
        key[i].info=NULL; 
        key[i].num =strlen(key[i].ch ); 
    } 
    key[0].info ="indefinite article.";//a  
    key[1].info ="indefinite articles.";//an  
    key[2].info ="definite article.";//the  
    key[3].info ="personal pronoun";//them  
    key[4].info ="adv. of place and direction";//there  
    key[5].info ="to this point or place";//here  
    key[6].info ="pronoun";//they  
    key[7].info ="v.i. joining subject & predicate";//are  
    key[8].info ="adj. & pron.";//that  
    key[9].info ="adj. & pron.";//this  
    key[10].info ="adj. & pron.";//those  
    key[11].info ="adj. & pron.,asking for a selection from an indefinite number";//what  
    key[12].info ="adj. & pron.,asking for a selection from two or a group";//which  
    key[13].info ="adj. & int.,for what reason,with what purpose";//why  
    key[14].info ="adv.,at what time";//then  
    key[15].info ="conj.,connecting words";//and   
    key[16].info ="adj. & pron.";//these  
    for(i=0;i<=16;i++) 
      Insert_DLTree(T,key[i],n);//建立鍵樹模型  
}//CreatDLTree  
 
Status Search_DLTree(DLTree rt,int j,int &k){ 
    //若line中從第j個字符起長度為k的子串和指針rt所指向雙鏈樹中單詞相同,  
    //則數組count中相應分量增1,並返回TRUE,否則返回FALSE  
    DLTree p; 
    int found; 
    k=0; 
    found=FALSE; 
    p=rt->first ;//p指向雙鏈樹中的第一棵子樹的樹根  
    while (p && !found){ 
        while (p && p->symbol <line[j+k]) p=p->next ; 
        if (!p||p->symbol >line[j+k]) break;//在鍵樹的第k+1層上匹配失敗  
        else{//繼續匹配  
            p=p->first ; 
            k++; 
            if (p->kind ==LEAF){//找到一個單詞  
                if (!(line[j+k]>='a'&&line[j+k]<='z')||(line[j+k]>='A'&&line[j+k]<='Z' )){ 
                count[p->idx ].times++;//統計數組對應元素加1  
                found=TRUE; 
                }//若鍵樹中含有"commit",文本行中為commit則為找到,若為committing則為沒找到  
            }//if  
        }//else  
    }//while  
    return found; 
}//Search_DLTree  
 
void setmatch(DLTree root,char *line,FILE *f){ 
    //統計以root為根指針的鍵樹中,各關鍵字在本文本串line中重復出現的次數,  
    //並將其累加到統計數組count中去  
    unsigned i=0; 
    int k;//若查找成功,返回的k為所查找的關鍵字長度  
    while (fgets(line,LINESIZE,f)!=NULL){ 
        cout<<line; 
        i=0; 
        while (i<=strlen(line)){//LINESIZE){  
            if (!Search_DLTree(root,i,k)) { 
                if (((line[i]>='a' && line[i]<='z')||(line[i]>='A'&&line[i]<='Z')||(line[i]>='0'&&line[i]<='9')) &&!((line[i+1]>='a'&&line[i+1]<='z')||(line[i+1]>='A'&&line[i+1]<='Z')||(line[i+1]>='0'&&line[i+1]<='9'))) 
                    NUM++;//單詞總數加1  
                    i++;  //若查找不成功,則從下一個字符開始查找  
                }//if  
            else { 
                i+=k;     //查找成功,繼續在文本串中的第i+k-1個字符開始查找  
                NUM++; 
            }//else  
        }//while  
    }//while  
}//setmatch  
 
void Input(FILE *&f){ 
    char fname[30]; 
    cout<<"please input the file name:"; 
    cin>>fname;                     //輸入文件名  
    if ((f=fopen(fname,"r"))==NULL){//輸入錯誤文件名, 無法打開  
        cout<<"Can NOT open the file!"<<endl; 
        exit(1); 
    } 
}//Input  
 
void output(int n){ 
    cout<<endl; 
    for(int i=1;i<=n;i++) 
        if (count[i].times){ 
            cout<<count[i].info.ch<<" : "; 
            if(count[i].info.info) cout<<count[i].info.info<<endl; 
            cout<<float(count[i].times)/float(NUM)<<" in the sentence."<<endl; 
        }//輸出文章中含有鍵樹中的關鍵字及有關解釋,出現頻率。即:該關鍵字出現次數/文章單詞總數  
    cout<<endl; 

#include <stdlib.h>
#include <string.h>
#include <stdio.h>

//Status是函數的類型,其值是函數結果狀態代碼
typedef int Status;
#define INT_MAX 32767
#define LINESIZE 80//設一行字符數為80
#define MAXKEYLEN 26//關鍵字的長度
#define MAXNUM 100//統計單詞的最大數
typedef struct{  
 char *ch;  //關鍵字
 int  num;  //關鍵字的長度
 char *info;//關鍵字有關信息
}KeysType;     //關鍵字類型

typedef enum{LEAF,BRANCH} NodeKind;//結點種類:{葉子,分支}

typedef struct DLTNode{
 char symbol;
 struct DLTNode *next;     //指向兄弟結點的指針
 NodeKind kind;            //結點標志
 union{
  struct DLTNode *first;//分支結點的孩子鏈指針
  struct {
   int idx;          //葉子結點的count數組下標指針
   KeysType infoptr;
  };
 };
}DLTNode,*DLTree;   //雙鏈樹類型
char line[LINESIZE];//用於緩存文章中每行的字符串
struct{
 int times;
 KeysType info;
}count[MAXNUM];
int NUM=0;//記錄整個文章的單詞總數
char ch1[17][100]={"a","an","the","them","there","here","they","are","that","this","those","what","which","why","then","and","these"};

void InitTree(DLTree &root){
 //初始化鍵樹
 root=new DLTNode;
 root->next =NULL;
 root->first=NULL;
}//InitTree

Status Insert_DLTree(DLTree &root,KeysType K,int &n){
 //指針root所指雙鏈樹中已含n個關鍵字,若不存在和K相同的關鍵字,
 //則將關鍵字K插入到雙鏈樹中相應位置,樹中關鍵字個數n增1且返回TRUE;..
 //否則不再插入返回FALSE
 DLTree p=root->first,f=root,pre,s;
 int j=0;
    while (p && j<K.num ){//在鍵樹中進行查找
        pre=NULL;
  while (p && p->symbol <K.ch [j]){//查找和K.ch[j]相同的結點
   pre=p;
   p=p->next ;
  }
  if (p && p->symbol ==K.ch[j]){
   f=p;
   p=p->first ;
   j++;
  }//找到後進入鍵樹的下一層,即查找和K.ch[j+1]相同的結點
  else{//沒有找到和K.ch[j]相同的結點,插入K.ch[j]
   s=new DLTNode;
   s->kind =BRANCH;
   s->symbol =K.ch [j++];
   if (pre) pre->next =s;
   else f->first =s;
   s->next =p;
   s->first =NULL;
   p=s;
   break;
  }//else
 }//while
    if (p&&j==K.num && p->first&&p->first ->kind ==LEAF) return FALSE;
 else{//鍵樹中已存在相同前綴的單詞,插入由剩余字符構成的單支樹
  while (j<=K.num ){
    s=new DLTNode;
    s->next =NULL;
    if (p){
    s->next =p->next ;
    p->first =s;
    p=s;
    }
    else {
     f->first =s;
     p=s;
    }
    if(j<K.num ){
     s->kind =BRANCH;
     s->symbol =K.ch [j++];
     s->first =NULL;
    }
    else{
     s->symbol ='$';
     s->kind =LEAF;
     n++;
     s->idx =n;
     s->infoptr.ch =count[s->idx ].info.ch =K.ch ;
     s->infoptr.info =count[s->idx ].info.info =K.info ;
     s->infoptr.num =count[s->idx ].info.num=K.num ;
     count[s->idx ].times=0;
     j++;
    }
   }//while
  return TRUE;
  }//else
}//Insert_DLTree


void CreatDLTree(DLTree &T,KeysType *key,int &n){
 //建立鍵樹模型
 for (int i=0;i<=16;i++){
  key[i].ch=ch1[i];
        key[i].info=NULL;
  key[i].num =strlen(key[i].ch );
 }
 key[0].info ="indefinite article.";//a
 key[1].info ="indefinite articles.";//an
 key[2].info ="definite article.";//the
    key[3].info ="personal pronoun";//them
    key[4].info ="adv. of place and direction";//there
    key[5].info ="to this point or place";//here
    key[6].info ="pronoun";//they
    key[7].info ="v.i. joining subject & predicate";//are
    key[8].info ="adj. & pron.";//that
    key[9].info ="adj. & pron.";//this
    key[10].info ="adj. & pron.";//those
    key[11].info ="adj. & pron.,asking for a selection from an indefinite number";//what
    key[12].info ="adj. & pron.,asking for a selection from two or a group";//which
    key[13].info ="adj. & int.,for what reason,with what purpose";//why
    key[14].info ="adv.,at what time";//then
    key[15].info ="conj.,connecting words";//and
    key[16].info ="adj. & pron.";//these
    for(i=0;i<=16;i++)
      Insert_DLTree(T,key[i],n);//建立鍵樹模型
}//CreatDLTree

Status Search_DLTree(DLTree rt,int j,int &k){
 //若line中從第j個字符起長度為k的子串和指針rt所指向雙鏈樹中單詞相同,
 //則數組count中相應分量增1,並返回TRUE,否則返回FALSE
 DLTree p;
 int found;
 k=0;
 found=FALSE;
 p=rt->first ;//p指向雙鏈樹中的第一棵子樹的樹根
    while (p && !found){
  while (p && p->symbol <line[j+k]) p=p->next ;
  if (!p||p->symbol >line[j+k]) break;//在鍵樹的第k+1層上匹配失敗
  else{//繼續匹配
   p=p->first ;
   k++;
   if (p->kind ==LEAF){//找到一個單詞
    if (!(line[j+k]>='a'&&line[j+k]<='z')||(line[j+k]>='A'&&line[j+k]<='Z' )){
    count[p->idx ].times++;//統計數組對應元素加1
    found=TRUE;
    }//若鍵樹中含有"commit",文本行中為commit則為找到,若為committing則為沒找到
   }//if
  }//else
 }//while
 return found;
}//Search_DLTree

void setmatch(DLTree root,char *line,FILE *f){
 //統計以root為根指針的鍵樹中,各關鍵字在本文本串line中重復出現的次數,
 //並將其累加到統計數組count中去
 unsigned i=0;
 int k;//若查找成功,返回的k為所查找的關鍵字長度
 while (fgets(line,LINESIZE,f)!=NULL){
  cout<<line;
        i=0;
  while (i<=strlen(line)){//LINESIZE){
   if (!Search_DLTree(root,i,k)) {
    if (((line[i]>='a' && line[i]<='z')||(line[i]>='A'&&line[i]<='Z')||(line[i]>='0'&&line[i]<='9')) &&!((line[i+1]>='a'&&line[i+1]<='z')||(line[i+1]>='A'&&line[i+1]<='Z')||(line[i+1]>='0'&&line[i+1]<='9')))
     NUM++;//單詞總數加1
        i++;  //若查找不成功,則從下一個字符開始查找
    }//if
      else {
     i+=k;     //查找成功,繼續在文本串中的第i+k-1個字符開始查找
    NUM++;
   }//else
  }//while
 }//while
}//setmatch

void Input(FILE *&f){
 char fname[30];
 cout<<"please input the file name:";
 cin>>fname;                     //輸入文件名
 if ((f=fopen(fname,"r"))==NULL){//輸入錯誤文件名, 無法打開
  cout<<"Can NOT open the file!"<<endl;
  exit(1);
 }
}//Input

void output(int n){
 cout<<endl;
 for(int i=1;i<=n;i++)
  if (count[i].times){
   cout<<count[i].info.ch<<" : ";
   if(count[i].info.info) cout<<count[i].info.info<<endl;
   cout<<float(count[i].times)/float(NUM)<<" in the sentence."<<endl;
  }//輸出文章中含有鍵樹中的關鍵字及有關解釋,出現頻率。即:該關鍵字出現次數/文章單詞總數
  cout<<endl;
}


 

  1. 上一頁:
  2. 下一頁:
Copyright © 程式師世界 All Rights Reserved