# Location of the PDF to read. The raw string (r"...") keeps the Windows
# backslashes literal instead of treating them as escape sequences.
# NOTE(review): the folder and file names contain leading/trailing spaces —
# confirm this matches the real path on disk.
path = r"C:\Users\lenovo\Desktop\ Thesis and interview \ Customer focus .pdf"
2. Open the PDF file
# Open the PDF at `path` with pdfplumber; the resulting object is used below
# to access the document's pages.
# NOTE(review): nothing visible in this excerpt closes pdf_mt — confirm it is
# closed (e.g. pdf_mt.close()) once extraction is finished.
pdf_mt = pdfplumber.open(path)
# Bare expression: has no effect in a script; presumably left over from an
# interactive/notebook session where it echoes the object.
pdf_mt
3. Get the pages where the data is located (and how many pages there are in total)
# Get the pages of the PDF as a list:
# [first page object, second page object, ..., n-th page object]
all_pages = pdf_mt.pages
# Bare expression: has no effect in a script; presumably left over from an
# interactive/notebook session where it echoes the list of pages.
all_pages
4. Extract the text of each PDF page (text of the first 40 pages)
# Print the extracted text of each of the first 40 pages.
# The slice simply yields fewer pages if the document has fewer than 40.
for pdf_pg in all_pages[0:40]:
    print(pdf_pg.extract_text())
5. Extract the contents of the tables
# Print the tables found on each of the first 40 pages.
# extract_tables() yields the page's tables; each table is two-dimensional
# data (a list of rows, each row a list of cell values).
for pdf_pg in all_pages[0:40]:
    print(pdf_pg.extract_tables())
# Copy every table row from the selected pages into the worksheet, then save
# the workbook to disk.
# NOTE(review): need_pages, ws and wb are defined outside this excerpt; ws/wb
# are presumably an openpyxl worksheet/workbook — confirm.
for pdf_pg in need_pages:
    # Each table is two-dimensional data with rows and columns, i.e. a list
    # of rows where every row is itself a list of cell values.
    for pdf_tb in pdf_pg.extract_tables():
        # Write the data into the worksheet row by row.
        for row in pdf_tb:
            ws.append(row)
# Save once, after all rows from all pages have been appended.
wb.save("demo3.xlsx")