突然想看電影了,就上貓眼電影看了下,但是不知道哪部電影好看,隨便翻了翻,感覺評分不是很準,於是批量抓取並分析了一下數據,然而結果也不是那麼理想。具體實現流程如下,有興趣可以嘗試一下。
1、進入貓眼電影站點,選擇電影分類,如下圖選擇詳細分類
2、分析網頁內容:打開控制台,選擇一個電影標籤,在控制台可以看到對應的 HTML 結構
def get_html(url, timeout=10):
    """Download *url* and return the response body as text.

    The request is sent with browser-like headers (User-Agent and a captured
    session cookie) and with one proxy mapping chosen at random from a fixed
    pool.

    Args:
        url: Absolute URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a dead proxy.

    Returns:
        The response text when the server answers with HTTP 200; an empty
        string for any other status code or when the request itself fails
        (connection error, timeout, ...).
    """
    print("獲取網頁: %s" % url)
    # Proxy pool; one entry is picked at random for every request.
    # NOTE(review): every entry only maps the "http" scheme, so requests to
    # https:// URLs are presumably not routed through these proxies — confirm
    # whether an "https" key is intended.
    # NOTE(review): 202.55.5.209:8090 appears twice, doubling its selection odds.
    proxies = [
        {'http': 'http://202.55.5.209:8090'},
        {'http': 'http://183.247.199.114:30001'},
        {'http': 'http://122.9.101.6:8888'},
        {'http': 'http://202.55.5.209:8090'},
    ]
    # Disguise the request as coming from a regular browser session.
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36",
        "Cookie": "uuid_n_v=v1; uuid=965711E0E8C611EC8A445B788061A84C64DB045EA13840149498F0104B8AF19A; _csrf=231cfce2d54abbd1bf2609ca76cd22ec894318c8223a9e698eb9b798bc2adbd8; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1654870020; _lx_utm=utm_source=google&utm_medium=organic; _lxsdk_cuid=1814df090c8c8-02bcc6fa43763b-1d525635-13c680-1814df090c8c8; _lxsdk=965711E0E8C611EC8A445B788061A84C64DB045EA13840149498F0104B8AF19A; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1654870030; __mta=142572173.1654870021186.1654870021186.1654870030627.2; _dd_s=logs=1&id=aa00cee7-93d3-4b37-aa97-0eb2f74923bf&created=1654870020029&expire=1654871326477; _lxsdk_s=1814df090c9-86d-7ca-093||6"
    }
    try:
        resp = requests.get(
            url,
            headers=headers,
            proxies=random.choice(proxies),
            timeout=timeout,
        )
    except requests.RequestException as err:
        # Treat network failures like a non-200 answer so the caller's loop
        # can move on to the next page instead of aborting the whole crawl.
        print("請求失敗: %s" % err)
        return ""
    if resp.status_code == 200:
        return resp.text
    return ""
def extract_html(html):
    """Parse one film-list page and append its movies to ``data.txt``.

    For every ``div.movie-item-hover`` card one line is appended to the file.
    The line is the ``str()`` of a list ``[name, score, info...]`` where
    ``info`` holds the text that follows each ``span.hover-tag`` label in the
    card's ``div.movie-hover-title`` rows, except the first row.

    Args:
        html: HTML source of the page. An empty string yields no output.

    Returns:
        None. The result is the side effect on ``data.txt``.
    """
    print("數據解析開始")
    soup = BS4(html, "lxml")
    rows = []
    for hover in soup.find_all("div", class_="movie-item-hover"):
        name_tag = hover.find("span", class_="name")
        if name_tag is None:
            # A card without a title cannot be identified later; skip it
            # rather than crash on ``None.text``.
            continue
        score_tag = hover.find("span", class_="score")
        score = "無評分" if score_tag is None else score_tag.text
        fields = [name_tag.text, score]
        # The first "movie-hover-title" row is skipped, as before
        # (presumably it repeats the title/score — verify against the page).
        for info in hover.find_all("div", class_="movie-hover-title")[1:]:
            label = info.find("span", class_="hover-tag")
            value = label.next_sibling if label is not None else None
            # The wanted text is the string node right after the label. When
            # it is missing, store "" so later columns keep their position.
            fields.append(value.strip() if isinstance(value, str) else "")
        rows.append(str(fields) + "\n")
    if rows:
        # One append per page instead of reopening the file for every movie.
        # The platform default encoding is kept so existing readers of
        # data.txt continue to match.
        with open("data.txt", "a+") as d:
            d.writelines(rows)
def main(start_page=20, end_page=40):
    """Crawl a range of film-list pages and store the extracted movies.

    Each page is downloaded with ``get_html`` and handed to ``extract_html``;
    after every page the crawler sleeps 1-3 seconds (chosen at random) to
    avoid hammering the site.

    Args:
        start_page: Index of the first page to fetch (inclusive).
        end_page: Index at which to stop (exclusive).
    """
    print("任務開始")
    for i in range(start_page, end_page):
        print("開始第%d頁" % i)
        # The offset is computed as 30 per page index.
        url = "https://www.maoyan.com/films?showType=3&offset=%d" % (i * 30)
        html = get_html(url)
        extract_html(html)
        print("完成第%d頁" % i)
        sleep_time = random.randint(1, 3)
        # The value is a duration in seconds (the old message said "pages").
        print("休眠%d秒" % sleep_time)
        time.sleep(sleep_time)
    print("任務完成")
def _parse_movie_line(line):
    """Decode one ``data.txt`` line into ``(name, score, genre)`` or ``None``."""
    import ast

    line = line.strip()
    if not line:
        return None
    try:
        # Lines are the str() of a list of strings, so literal_eval restores
        # them exactly, including titles that contain commas.
        fields = ast.literal_eval(line)
    except (ValueError, SyntaxError):
        return None
    if not isinstance(fields, list) or len(fields) < 2:
        return None
    raw_score = str(fields[1]).strip()
    if raw_score == "無評分":
        # Unrated movies are plotted at 0.
        score = 0
    else:
        try:
            score = float(raw_score)
        except ValueError:
            return None
    # Rows without extra info simply contribute no genre text.
    genre = str(fields[2]) if len(fields) > 2 else ""
    # Spaces are dropped from name and genre, matching the previous output.
    return str(fields[0]).replace(" ", ""), score, genre.replace(" ", "")


def analysis(max_rows=1001):
    """Plot the scores stored in ``data.txt`` and build a genre word cloud.

    Reads up to ``max_rows`` valid rows, prints ``name:genre`` for each one,
    shows a scatter plot of the scores (x axis is the row index) and writes a
    word cloud of the third column to ``res.png`` using the font file
    ``11.ttf``.

    Args:
        max_rows: Maximum number of rows to use. The default keeps the
            previous cap, which stopped once the counter exceeded 1000.

    Returns:
        None. Malformed or blank lines are skipped; if no usable row is found
        nothing is plotted.
    """
    score_list = []
    genres = []
    with open("data.txt", "r") as f:
        for line in f:
            if len(score_list) >= max_rows:
                break
            parsed = _parse_movie_line(line)
            if parsed is None:
                continue
            name, score, genre = parsed
            score_list.append(score)
            genres.append(genre)
            print(name + ":" + genre)
    if not score_list:
        print("沒有可分析的數據")
        return
    # The x axis uses the row index as a label instead of the movie title.
    name_list = [str(i) for i in range(len(score_list))]
    # Genres are separated by "/" in the data; normalise to commas.
    words = ",".join(genres).replace("/", ",")
    # Use a font with CJK glyphs so Chinese labels render correctly.
    plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']
    plt.figure(figsize=(10, 10.5))
    plt.scatter(name_list, score_list, c="red")
    plt.xlabel("電影")
    plt.ylabel("分數")
    plt.title("電影評分")
    plt.show()
    if not words.strip(","):
        # WordCloud.generate raises when given no words at all.
        print("沒有可用於詞雲的數據")
        return
    w = wordcloud.WordCloud(width=1000, height=700, background_color='white', font_path='11.ttf', collocations=False,
                            scale=1.5)
    w.generate(words)
    w.to_file('res.png')
1、散點圖
2、詞雲
商業電影網站的評分不太準,有點失望。大家認為哪個平台比較準一些?歡迎討論。