# The following utility determines whether a Unicode character is a Chinese character, a digit, an English letter, or some other character. It also converts full-width symbols to half-width symbols and normalizes Unicode strings.
#!/usr/bin/env python
# -*- coding:GBK -*-
"""Tools for Chinese text processing.

Classify a single Unicode character as a Chinese character, a digit, an
English letter or "other"; convert between full-width and half-width
forms; normalize whole strings.
"""

# Python 2 builds Unicode characters with unichr(); Python 3 folded that
# into chr(). Bind whichever exists so the module runs on both.
try:
    _unichr = unichr
except NameError:
    _unichr = chr


def is_chinese(uchar):
    """Return True if the Unicode character is in the range U+4E00..U+9FA5."""
    return u'\u4e00' <= uchar <= u'\u9fa5'


def is_number(uchar):
    """Return True if the Unicode character is an ASCII digit 0-9."""
    return u'\u0030' <= uchar <= u'\u0039'


def is_alphabet(uchar):
    """Return True if the Unicode character is an ASCII letter A-Z or a-z."""
    return (u'\u0041' <= uchar <= u'\u005a') or (u'\u0061' <= uchar <= u'\u007a')


def is_other(uchar):
    """Return True if the character is not Chinese, a digit or a letter."""
    return not (is_chinese(uchar) or is_number(uchar) or is_alphabet(uchar))


def B2Q(uchar):
    """Convert a half-width character to its full-width form.

    Characters outside 0x20..0x7E are returned unchanged.
    """
    inside_code = ord(uchar)
    if inside_code < 0x0020 or inside_code > 0x7e:
        # Not a half-width character: return it untouched.
        return uchar
    if inside_code == 0x0020:
        # The space is the one exception to the fixed offset below.
        inside_code = 0x3000
    else:
        # For everything except the space: full-width = half-width + 0xFEE0.
        inside_code += 0xfee0
    return _unichr(inside_code)


def Q2B(uchar):
    """Convert a full-width character to its half-width form.

    If the converted code does not land in 0x20..0x7E, the input was not a
    full-width character and it is returned unchanged.
    """
    inside_code = ord(uchar)
    if inside_code == 0x3000:
        inside_code = 0x0020
    else:
        inside_code -= 0xfee0
    if inside_code < 0x0020 or inside_code > 0x7e:
        # The result is not a half-width character: keep the original.
        return uchar
    return _unichr(inside_code)


def stringQ2B(ustring):
    """Convert every full-width character in the string to half-width."""
    return "".join([Q2B(uchar) for uchar in ustring])


def uniform(ustring):
    """Normalize a string: full-width to half-width, then lower-case."""
    return stringQ2B(ustring).lower()


def string2List(ustring):
    """Split the string into runs of Chinese characters, letters and digits.

    Any character for which is_other() is true acts as a separator and is
    dropped. Adjacent Chinese characters, letters and digits that have no
    separator between them stay together in one run.
    """
    retList = []
    utmp = []
    for uchar in ustring:
        if not is_other(uchar):
            utmp.append(uchar)
        elif utmp:
            # A separator ends the current run, if there is one.
            retList.append("".join(utmp))
            utmp = []
    if utmp:
        retList.append("".join(utmp))
    return retList


if __name__ == "__main__":
    # Test Q2B and B2Q: round-trip every half-width character.
    for i in range(0x0020, 0x007F):
        full = B2Q(_unichr(i))
        print("%s %s" % (Q2B(full), full))

    # Test uniform and string2List.
    ustring = u" China The person's name a high frequency A"
    ustring = uniform(ustring)
    ret = string2List(ustring)
    print(ret)