Data collection
"""Scrape the box-office table from piaofang.biz and save it as a CSV file."""
import csv

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Request the movie box-office page. A timeout keeps the script from hanging
# forever on a dead connection, and raise_for_status() stops us from parsing
# an HTTP error page as if it were the real table.
text = requests.get(url="http://www.piaofang.biz/", timeout=30)
text.raise_for_status()
# Let requests guess the real charset from the body so the text decodes
# correctly.
text.encoding = text.apparent_encoding

# Parse the page with BeautifulSoup, using the built-in "html.parser".
# Install: pip3 install -i https://pypi.tuna.tsinghua.edu.cn/simple beautifulsoup4
main_page = BeautifulSoup(text.text, "html.parser")

# Locate the first <table> on the page.
table = main_page.find("table")
if table is None:
    raise RuntimeError("No <table> element found on http://www.piaofang.biz/")

# The output file must actually be opened before writing (the original open()
# call was commented out, so every f.write raised NameError). The `with` block
# also guarantees the file is closed if parsing fails half-way.
with open(" Movie box office .csv", mode="w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f)
    for tr in table.find_all("tr"):
        lst = tr.find_all("td")
        if not lst:
            # Rows that contain no <td> cells are skipped.
            continue
        # strip() removes surrounding whitespace; commas and dollar signs are
        # removed from each cell so the values stay in a single CSV field.
        # NOTE: csv.writer does not emit the trailing comma the hand-written
        # version produced; column positions 0..n-1 are unchanged.
        writer.writerow(
            [
                td.text.strip().replace(",", "").strip().replace("$", "")
                for td in lst
            ]
        )
Data analysis and plotting
"""Count movie types per year from the scraped CSV and render a timeline chart."""
import numpy as np
import pandas as pd
from pyecharts import options as opts
from pyecharts.charts import Pie, Bar, Timeline

### Data extraction
data = pd.read_csv(" Movie box office .csv", encoding="utf-8", header=None)

# Keep only columns 2 and 3. In .loc[rows, cols] the part before the comma
# selects rows (":" = all rows) and the part after it selects columns.
# Column 2 is later labelled "year" and column 3 holds a "/"-separated type
# string.
data = data.loc[:, [2, 3]]


def func1(item):
    """Return the type that appears before the first "/" in *item*."""
    return item.split("/")[0]


def func2(item):
    """Return the type that appears after the first "/" in *item*.

    When *item* contains no "/", the placeholder "lxy" is returned so those
    rows can be filtered out afterwards.
    """
    if "/" in item:
        return item.split("/")[1]
    return "lxy"


# Put the first and second type of every movie into their own columns.
data[6] = data[3].map(func1)
data[7] = data[3].map(func2)

# Build one (year, type) frame per type position.
data_1 = data.loc[:, [2, 6]]
data_2 = data.loc[:, [2, 7]]
# Drop the placeholder rows: movies that only had a single type.
data_2 = data_2.loc[data_2[7] != "lxy"]
# Rename column 7 to 6 so both frames share the same column label; otherwise
# concat would produce two half-empty (NaN) columns.
data_2 = data_2.rename(columns={7: 6})

# Stack both frames, then count the rows for every (year, type) pair.
data = pd.concat([data_1, data_2])
data = data.groupby([2, 6]).size().reset_index()
data.columns = ["year", " type ", " Total number of types "]

### Plotting
timeline = Timeline()
# Playback interval in milliseconds (1 s = 1000 ms).
timeline.add_schema(play_interval=1000)

for year in data["year"].unique():
    resp = (
        data[data["year"] == year][[" type ", " Total number of types "]]
        .sort_values(by=" Total number of types ", ascending=False)
        .values.tolist()
    )

    # Everything below must stay INSIDE the loop: one bar chart is built and
    # added to the timeline for each year. When these statements run only
    # once after the loop, the timeline ends up with a single frame holding
    # the last year's data.
    bar = Bar()
    # Category axis: type names.
    bar.add_xaxis([x[0] for x in resp])
    # Value axis: number of movies of that type.
    bar.add_yaxis("", [x[1] for x in resp])
    # Draw the bars horizontally.
    bar.reversal_axis()
    # Show the count label to the right of each bar.
    bar.set_series_opts(label_opts=opts.LabelOpts(position="right"))
    # Chart title.
    bar.set_global_opts(
        title_opts=opts.TitleOpts(title=" Global movies 1982-2021 Type statistics ")
    )
    # Add this year's chart as one frame of the timeline, labelled by year.
    timeline.add(bar, f"{year} year ")

# Save the finished timeline chart as an HTML file.
timeline.render("year.html")
Why does the chart only show the data for 2021? What needs to be added?