您现在的位置：程式師世界 >> 編程語言 > >> 更多編程語言 >> Python

Python crawler - crawling data to import Excel

編輯：Python

1、 Import third-party library

Libraries used: bs4 (BeautifulSoup), re, urllib, and xlwt; sqlite3 is imported as well but is not used in the code shown here.

from bs4 import BeautifulSoup # Parse web pages 
import re # Regular expressions , Text matching 
import urllib.request,urllib.error # To develop url, Get web data 
import xlwt # Conduct excel operation 
import sqlite3 # Conduct SQLite Database operation

2、 Request access to web page

def askURL(url):
    """Download *url* and return the response body decoded as UTF-8.

    The request carries a desktop Chrome ``User-Agent`` header so that it
    looks like it comes from a regular browser.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page text, or an empty string when the request fails
        with ``urllib.error.URLError``; in that case the error's ``code``
        and/or ``reason`` (whichever are present) are printed.
    """
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.116 Safari/537.36"
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        # The context manager closes the response even if read() or
        # decode() raises; previously the response was never closed.
        with urllib.request.urlopen(request) as response:
            html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html

url： The URL you want to crawl

User-Agent Access method ：

3. Extract the data (regular expressions)

# Pre-compiled patterns; each one is applied to the HTML of a single movie
# entry and captures the wanted value in its one parenthesized group.

# Movie details link.
findLink = re.compile(r'<a href="(.*?)">')
# Cover picture. The match is confined to the <img> tag itself ([^>]*?);
# the former greedy ".*" with re.S ran on to the LAST src="..." in the entry.
findImgSrc = re.compile(r'<img[^>]*?src="(.*?)"', re.S)
# Movie title(s).
findTitle = re.compile(r'<span class="title">(.*)</span>')
# Rating.
findRating = re.compile(r'<span class="rating_num" property="v:average">(.*)</span>')
# Number of raters. The page text is Chinese; the previous literal
# " People comment on " was a translated string that can never match it.
# NOTE(review): confirm the exact wording against the live page markup.
findJudge = re.compile(r'<span>(\d*)人评价</span>')
# One-line summary.
findInq = re.compile(r'<span class="inq">(.*)</span>')
# Details paragraph (may span several lines, hence re.S).
findBd = re.compile(r'<p class="">(.*?)</p>', re.S)

Inspect the page's HTML source to find the markup pattern that surrounds each value you want, then write a regular expression for that pattern.
The parenthesized group (.*?) captures the data to extract; without the parentheses the value is not captured on its own.

4. Create the Excel sheet

# Create a new workbook object, requesting UTF-8 text encoding.
book = xlwt.Workbook(encoding="utf-8",style_compression=0)
# Add the worksheet that will hold the data; cell_overwrite_ok=True is passed
# so that writing to an already-written cell is permitted.
sheet = book.add_sheet(' Watercress movie Top250',cell_overwrite_ok=True)
# Save the workbook under a relative path; at this point only the empty
# sheet has been added.
book.save(' Watercress movie Top250.xls')

Once the workbook has been created, the scraped data is written into the Excel sheet.

【 Complete code 】

from bs4 import BeautifulSoup # Parse web pages 
import re # Regular expressions , Text matching 
import urllib.request,urllib.error # To develop url, Get web data 
import xlwt # Conduct excel operation 
import sqlite3 # Conduct SQLite Database operation 
def main():
    """Crawl the Douban Top 250 listing and export the rows to an .xls file."""
    listing_url = "https://movie.douban.com/top250?start="
    output_file = " Watercress movie Top250.xls"
    # Fetch and parse every listing page, then hand the rows to the writer.
    saveData(getData(listing_url), output_file)
# Pre-compiled patterns; each one is applied to the HTML of a single movie
# entry and captures the wanted value in its one parenthesized group.

# Movie details link.
findLink = re.compile(r'<a href="(.*?)">')
# Cover picture. The match is confined to the <img> tag itself ([^>]*?);
# the former greedy ".*" with re.S ran on to the LAST src="..." in the entry.
findImgSrc = re.compile(r'<img[^>]*?src="(.*?)"', re.S)
# Movie title(s).
findTitle = re.compile(r'<span class="title">(.*)</span>')
# Rating.
findRating = re.compile(r'<span class="rating_num" property="v:average">(.*)</span>')
# Number of raters. The page text is Chinese; the previous literal
# " People comment on " was a translated string that can never match it.
# NOTE(review): confirm the exact wording against the live page markup.
findJudge = re.compile(r'<span>(\d*)人评价</span>')
# One-line summary.
findInq = re.compile(r'<span class="inq">(.*)</span>')
# Details paragraph (may span several lines, hence re.S).
findBd = re.compile(r'<p class="">(.*?)</p>', re.S)
# Crawl to the web 
# A <br/> tag together with the whitespace around it. Written as a raw
# string: the former non-raw '\s' is an invalid string escape.
_BR_TAG = re.compile(r'<br(\s+)?/>(\s+)?')
# Placeholder stored for a field that is absent from an entry, so that every
# row keeps its eight columns.
_MISSING = " "


def _first_match(pattern, text, default=_MISSING):
    """Return the first match of *pattern* in *text*, or *default* if none."""
    found = pattern.findall(text)
    return found[0] if found else default


def _parse_item(item_html):
    """Extract the eight output fields from the HTML of one movie entry."""
    titles = findTitle.findall(item_html)
    if len(titles) == 2:
        ctitle = titles[0]
        otitle = titles[1].replace("/", "")
    elif titles:
        ctitle, otitle = titles[0], ' '
    else:
        ctitle, otitle = _MISSING, _MISSING

    summaries = findInq.findall(item_html)
    summary = summaries[0].replace(".", "") if summaries else " "

    details = _BR_TAG.sub(" ", _first_match(findBd, item_html))

    return [
        _first_match(findLink, item_html),
        _first_match(findImgSrc, item_html),
        ctitle,
        otitle,
        _first_match(findRating, item_html),
        _first_match(findJudge, item_html),
        summary,
        details.strip(),
    ]


def getData(baseurl, pages=10, page_size=25):
    """Download the listing pages and return one row per movie entry.

    Each page is fetched with ``askURL``, parsed with BeautifulSoup, and
    every ``<div class="item">`` is reduced to a row by ``_parse_item``.
    A field whose pattern does not match is stored as a single space
    instead of raising ``IndexError``, mirroring how a missing summary or
    foreign title was already handled.

    Args:
        baseurl: URL prefix; the start offset of each page is appended to it.
        pages: Number of listing pages to request (default 10).
        page_size: Offset step between consecutive pages (default 25).

    Returns:
        A list of rows. Each row is a list of eight strings: details link,
        cover image link, Chinese title, foreign title, rating, number of
        raters, summary, and details text.
    """
    datalist = []
    for page in range(pages):
        html = askURL(baseurl + str(page * page_size))
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all('div', class_="item"):
            datalist.append(_parse_item(str(item)))
    return datalist
# Get and specify a web page content 
def askURL(url):
    """Download *url* and return the response body decoded as UTF-8.

    The request carries a desktop Chrome ``User-Agent`` header so that it
    looks like it comes from a regular browser.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page text, or an empty string when the request fails
        with ``urllib.error.URLError``; in that case the error's ``code``
        and/or ``reason`` (whichever are present) are printed.
    """
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.116 Safari/537.36"
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        # The context manager closes the response even if read() or
        # decode() raises; previously the response was never closed.
        with urllib.request.urlopen(request) as response:
            html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
# Save the data 
def saveData(datalist, savepath):
    """Write the scraped rows to an .xls workbook saved at *savepath*.

    Row 0 of the sheet holds the column headers; each entry of *datalist*
    is written to the following row. A progress line is printed per entry.

    Args:
        datalist: Sequence of rows; the first eight values of each row are
            written, one per column.
        savepath: Destination path of the workbook. Previously this argument
            was ignored in favour of a hard-coded file name.
    """
    print("save....")
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)
    sheet = book.add_sheet(' Watercress movie Top250', cell_overwrite_ok=True)
    col = (
        " Movie details link ",
        " Cover link ",
        " The Chinese name of the film ",
        " The foreign name of the film ",
        " score ",
        " Evaluation number ",
        " survey ",
        " Related information ",
    )
    for j, header in enumerate(col):
        sheet.write(0, j, header)
    # Iterate over the rows actually collected; a fixed range(250) raised
    # IndexError (and lost the file) whenever fewer rows had been scraped.
    for i, data in enumerate(datalist):
        print(" The first %d strip " % (i + 1))
        for j in range(len(col)):
            sheet.write(i + 1, j, data[j])
    book.save(savepath)
# Run the crawl only when this file is executed as a script, so that merely
# importing it does not start network requests or write the output file.
if __name__ == "__main__":
    main()
    print(" Crawling over ")

【 Running results 】