#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os.path
import bs4,shutil,time
from pandas.core.frame import DataFrame
def get_html_tabledata(htmlpath, tableindex: int = 0):
    """Read an HTML file and extract one table's data as a list of dicts.

    :param htmlpath: path to the HTML file
    :param tableindex: zero-based index of the <table> to extract (default 0)
    :return: list of dicts, one per data row, keyed by the header row's cell text
    """
    # Open read-only: the original 'r+' requested write access that is never
    # used and would fail on read-only files. Strip a UTF-8 BOM if present.
    with open(htmlpath, 'r', encoding='UTF-8') as f:
        content = f.read().strip().replace('\ufeff', '')
    soup = bs4.BeautifulSoup(content, 'lxml')  # parse the HTML document
    # find_all is the modern bs4 name for the deprecated findAll alias.
    table = soup.find_all("table")[tableindex]  # pick the requested table
    table_rows = table.find_all("tr")  # every row of that table
    # The first row supplies the dict keys; header cells may be <th> or <td>.
    keys = [cell.getText().strip() for cell in table_rows[0].find_all(['th', 'td'])]
    tabledata = []
    for table_row in table_rows[1:]:
        cells = table_row.find_all(['th', 'td'])
        # zip pairs header keys with this row's cells; rows with extra cells
        # are truncated to the header width (the original raised IndexError),
        # and short rows simply omit the missing columns, as before.
        tabledata.append({key: cell.getText().strip() for key, cell in zip(keys, cells)})
    return tabledata
def html_to_excel(htmlpath, excelpath, tableindex: int = 0):
    """Extract one table from an HTML file and write it to an Excel file.

    :param htmlpath: path to the source HTML file
    :param excelpath: path of the Excel file to create
    :param tableindex: zero-based index of the <table> to extract (default 0)
    """
    rows = get_html_tabledata(htmlpath, tableindex)
    # A list of per-row dicts converts directly into a tabular DataFrame.
    frame = DataFrame(rows)
    # Keep the column headers but drop the numeric row index in the output.
    frame.to_excel(excelpath, index=False, header=True)
if __name__ == '__main__':
    # Ad-hoc driver: pull the second table (index 1) from a local download
    # and print its rows sorted by the keyword column.
    htmlpath = r'C:\Users\yhen\Downloads\2022-06-17T13_51_06+0800.html'
    rows = get_html_tabledata(htmlpath, 1)
    rows.sort(key=lambda record: record['關鍵字'])
    print(rows)