# 網頁文本的文本預處理 — text preprocessing for scraped web-page (HTML) text
import re
class Tool:
    """Strip HTML markup from a page fragment and normalise its whitespace.

    The patterns are compiled once as class attributes and applied in a
    fixed order by :meth:`replace`. Order matters: structural tags are
    turned into newlines/tabs *before* the catch-all tag remover runs.
    """

    # <img ...> tags and runs of exactly seven spaces (indent padding).
    removeImg = re.compile(r'<img.*?>| {7}')
    # Opening and closing anchor tags; the link text itself is kept.
    removeAddr = re.compile(r'<a.*?>|</a>')
    # Block-level boundaries that become a line break.
    replaceLine = re.compile(r'<tr>|<div>|</div>|</p>')
    # Table cells become a tab.
    replaceTD = re.compile(r'<td>')
    # Opening paragraph tags (with or without attributes) become a line break.
    replacePara = re.compile(r'<p.*?>')
    # One or two consecutive <br> collapse to a single line break.
    replaceBR = re.compile(r'<br><br>|<br>')
    # Any tag still left after the targeted replacements above.
    removeExtraTag = re.compile(r'<.*?>')
    # NOTE(review): this pattern only matches the literal text
    # `<span" "</span>` and runs after removeExtraTag, so it is effectively
    # a no-op. Kept unchanged because it is a public class attribute.
    removeSpan = re.compile('<span(" ")+</span>')
    # A run of newlines, optionally followed by one tab, becomes one newline.
    replaceN = re.compile(r"\n+\t?")
    # Double ideographic space (U+3000 x2) or a no-break space (U+00A0).
    special_symbol = re.compile("\u3000\u3000|\xa0")

    def replace(self, x: str) -> str:
        """Return *x* with HTML tags removed and whitespace tidied.

        Args:
            x: Raw HTML text.

        Returns:
            The plain-text content, with leading/trailing whitespace stripped.
        """
        # The result must be assigned back; previously it was discarded, so
        # the special whitespace characters were never actually removed.
        x = self.special_symbol.sub("", x)
        x = self.removeImg.sub("", x)
        x = self.removeAddr.sub("", x)
        x = self.replaceLine.sub("\n", x)
        x = self.replaceTD.sub("\t", x)
        x = self.replacePara.sub("\n", x)
        x = self.replaceBR.sub("\n", x)
        x = self.removeExtraTag.sub("", x)
        x = self.removeSpan.sub("", x)
        x = self.replaceN.sub("\n", x)
        return x.strip()
# Module-level cleaner instance.
tool = Tool()
# NOTE(review): `s` is not defined in the visible code; presumably it is the
# raw HTML string to clean — confirm it is assigned before this line runs.
sub_content = tool.replace(s)