選擇file(文件) >>> setting(設置) >>> Project(項目) >>> python interpreter(python解釋器)
點擊齒輪, 選擇add
添加python安裝路徑
選擇file(文件) >>> setting(設置) >>> Plugins(插件)
點擊 Marketplace 輸入想要安裝的插件名字 比如:翻譯插件 輸入 translation / 漢化插件 輸入 Chinese
選擇相應的插件點擊 install(安裝) 即可
安裝成功之後會彈出重啟 pycharm 的選項, 點擊確定, 重啟後即可生效
網頁開發者工具進行抓包分析…
# 導入數據請求模塊
import requests
# 導入正則表達式模塊
import re
# 導入json模塊
import json
# 導入格式化輸出模塊
import pprint
# 導入csv模塊
import csv
# 導入時間模塊
import time
# 導入隨機模塊
import random
# Open the output CSV in append mode so rows from earlier runs are kept.
# newline='' hands line-ending control to the csv module.
# NOTE(review): the original note here asked whether saving as UTF-8 produced
# garbled text; if the file is meant to be opened in Excel, 'utf-8-sig' is the
# usual remedy -- confirm before changing the encoding.
f = open('data多頁_1.csv', mode='a', encoding='utf-8', newline='')
csv_writer = csv.DictWriter(f, fieldnames=[
    '職位',
    '城市',
    '經驗',
    '學歷',
    '薪資',
    '公司',
    '福利待遇',
    '公司領域',
    '公司規模',
    '公司類型',
    '發布日期',
    '職位詳情頁',
    '公司詳情頁',
])
# Append mode positions the stream at the end of the file, so offset 0 means
# the file is new or empty. Only then write the header row; otherwise every
# re-run would add another header line in the middle of the data.
if f.tell() == 0:
    csv_writer.writeheader()
用 python 代碼模擬瀏覽器, 對 url 地址發送請求
不要企圖一節課, 掌握所有內容, 要學習聽懂思路, 每一步我們為什麼這麼做…
知道headers 1
不知道headers 2
headers 請求頭, 作用是把 python 代碼偽裝成瀏覽器
字典形式, 構建完整鍵值對
如果 headers 偽裝不夠, 服務器可能會識別出你是爬蟲程序, 從而不給你返回相應的數據內容
# Request headers make the script look like an ordinary browser. With too
# little disguise the server may recognise a crawler and withhold the data.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.0.0 Safari/537.36'
}
# The listing page carries its results as JSON assigned to
# window.__SEARCH_RESULT__ inside a <script> tag. Compiled once, outside the
# loop, with the dots escaped so they only match literal dots.
search_result_pattern = re.compile(r'window\.__SEARCH_RESULT__ = (.*?)</script>')

for page in range(1, 15):
    print(f'正在采集第{page}頁的數據內容')
    # Pause 1-2 seconds between pages.
    time.sleep(random.randint(1, 2))
    url = f'https://search.51job.com/list/010000%252C020000%252C030200%252C040000%252C090200,000000,0000,00,9,99,python,2,{page}.html'
    # A timeout keeps a stalled connection from hanging the whole run.
    response = requests.get(url=url, headers=headers, timeout=10)
    print(response)  # e.g. <Response [200]>, the response object

    # response.text is the page source as a string. An empty match list means
    # either the pattern is wrong or the response does not contain the data
    # (anti-crawling: captcha / login required, too few headers, blocked IP).
    # Indexing [0] on an empty list would raise "list index out of range",
    # so the page is skipped explicitly instead.
    matches = search_result_pattern.findall(response.text)
    if not matches:
        print(f'第{page}頁沒有找到職位數據, 可能被反爬了, 已跳過')
        continue
    html_data = matches[0]
    json_data = json.loads(html_data)

    # 'engine_jds' holds the list of job postings; each posting is a dict.
    for index in json_data['engine_jds']:
        try:
            dit = {
                '職位': index['job_title'],
                '城市': index['attribute_text'][0],
                '經驗': index['attribute_text'][1],
                '學歷': index['attribute_text'][2],
                '薪資': index['providesalary_text'],
                '公司': index['company_name'],
                '福利待遇': index['jobwelf'],
                '公司領域': index['companyind_text'],
                '公司規模': index['companysize_text'],
                '公司類型': index['companytype_text'],
                '發布日期': index['issuedate'],
                '職位詳情頁': index['job_href'],
                '公司詳情頁': index['company_href'],
            }
        except (KeyError, IndexError) as err:
            # A posting with a missing field is still skipped (as before),
            # but the reason is reported instead of being silently swallowed.
            print(f'跳過一條字段不完整的職位數據: {err!r}')
            continue
        csv_writer.writerow(dit)
        print(dit)

# Push buffered rows to disk now that all pages have been processed.
f.flush()
----> 爬蟲基本思路是什麼?
數據來源分析
請求響應 請求哪個網站呢? 網址是什麼 請求方式是什麼 請求參數要什麼?
發送請求 —> 獲取數據 —> 解析數據 —> 保存數據
import requests
import parsel
# Download one job-detail page and save its two main sections as a local
# HTML file.
url = 'https://jobs.51job.com/shanghai-jdq/137393082.html?s=sou_sou_soulb&t=0_0'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.0.0 Safari/537.36',
}
# A timeout keeps a stalled connection from hanging the script.
response = requests.get(url=url, headers=headers, timeout=10)
response.encoding = response.apparent_encoding  # detect the encoding automatically
print(response.text)
selector = parsel.Selector(response.text)
content_1 = selector.css('.cn').get()
content_2 = selector.css('.tCompany_main').get()
# .get() yields None when a selector matches nothing (e.g. a blocked or
# changed page); treat a missing section as empty instead of failing on
# None + str.
content = (content_1 or '') + (content_2 or '')
if content:
    # TODO: the original note suggests naming the file "<company> + <job title>";
    # the name is still hard-coded.
    # A dedicated handle name avoids rebinding the CSV handle `f`.
    with open('python.html', mode='w', encoding='utf-8') as html_file:
        html_file.write(content)
else:
    print('沒有提取到職位詳情內容, 可能被反爬了')
import pandas as pd
from pyecharts.charts import *
from pyecharts import options as opts
import re
from pyecharts.globals import ThemeType
from pyecharts.commons.utils import JsCode
# Load the scraped postings and derive a numeric average-salary column.
df = pd.read_csv("招聘數據.csv")
df.head()
df.info()
df['薪資'].unique()

# --- Pass 1: integer salary bounds plus an "N薪" (months-per-year) factor ---
# bottom: leading digits of the salary text; top: digits after the first '-'.
df['bottom'] = df['薪資'].str.extract(r'^(\d+).*')
df['top'] = df['薪資'].str.extract(r'^.*?-(\d+).*')
# Rows without an upper bound reuse the lower bound. Assign the result back
# rather than calling fillna(inplace=True) on the selected column, which is a
# chained in-place update and does not reach df under pandas Copy-on-Write.
df['top'] = df['top'].fillna(df['bottom'])
# Two digits between '·' and '薪' (e.g. '·13薪'); defaults to 12 when absent,
# so the factor below becomes 1.0.
df['commision_pct'] = df['薪資'].str.extract(r'^.*?·(\d{2})薪')
df['commision_pct'] = df['commision_pct'].fillna(12)
df['commision_pct'] = df['commision_pct'].astype('float64')
df['commision_pct'] = df['commision_pct'] / 12
# NOTE(review): this drops every row with a missing value in ANY column, not
# only the salary columns -- confirm that is intended.
df.dropna(inplace=True)
df['bottom'] = df['bottom'].astype('int64')
df['top'] = df['top'].astype('int64')
df['平均薪資'] = (df['bottom'] + df['top']) / 2 * df['commision_pct']
df['平均薪資'] = df['平均薪資'].astype('int64')
df.head()

# --- Pass 2: decimal bounds for "a-b萬/月" style values ---
# This pass overwrites bottom / top / 平均薪資 computed above.
# Any value containing '千/月' is replaced by the fixed range '0.3-0.7萬/月'.
df['薪資'] = df['薪資'].apply(lambda x: re.sub('.*千/月', '0.3-0.7萬/月', x))
df["薪資"].unique()
df['bottom'] = df['薪資'].str.extract(r'^(.*?)-.*?')
df['top'] = df['薪資'].str.extract(r'^.*?-(\d\.\d|\d)')
df.dropna(inplace=True)
df['bottom'] = df['bottom'].astype('float64')
df['top'] = df['top'].astype('float64')
# presumably the factor 10 converts 萬 to 千 -- TODO confirm the intended unit
df['平均薪資'] = (df['bottom'] + df['top']) / 2 * 10
df.head()
# Bar chart: mean of 平均薪資 per 學歷 value, sorted ascending.
mean = df.groupby('學歷')['平均薪資'].mean().sort_values()
x = mean.index.tolist()
y = mean.values.tolist()

# Built step by step; each call configures the same chart object.
c = Bar()
c.add_xaxis(x)
c.add_yaxis("學歷", y)
c.set_global_opts(
    title_opts=opts.TitleOpts(title="不同學歷的平均薪資"),
    datazoom_opts=opts.DataZoomOpts(),
)
# Hide the per-bar value labels.
c.set_series_opts(label_opts=opts.LabelOpts(is_show=False))
c.render_notebook()
# JavaScript snippets passed to ECharts through JsCode.
# color_js: gradient used as the bar fill colour.
color_js = """new echarts.graphic.LinearGradient(0, 1, 0, 0,
[{
offset: 0, color: '#63e6be'}, {
offset: 1, color: '#0b7285'}], false)"""
# color_js1: gradient used as the chart background.
color_js1 = """new echarts.graphic.LinearGradient(0, 0, 0, 1, [{
offset: 0,
color: '#ed1941'
}, {
offset: 1,
color: '#009ad6'
}], false)"""

# Number of postings per city, most frequent first; keep the top 20.
dq = df.groupby('城市')['職位'].count().to_frame('數量').sort_values(by='數量', ascending=False).reset_index()
x_data = dq['城市'].values.tolist()[:20]
y_data = dq['數量'].values.tolist()[:20]
b1 = (
    Bar(init_opts=opts.InitOpts(theme=ThemeType.DARK, bg_color=JsCode(color_js1), width='1000px', height='600px'))
    .add_xaxis(x_data)
    .add_yaxis(
        '',
        y_data,
        category_gap="50%",
        label_opts=opts.LabelOpts(
            font_size=12,
            color='yellow',
            font_weight='bold',
            font_family='monospace',
            position='insideTop',
            formatter='{b}\n{c}',
        ),
    )
    .set_series_opts(
        itemstyle_opts={
            "normal": {
                "color": JsCode(color_js),
                "barBorderRadius": [15, 15, 0, 0],
                "shadowColor": "rgb(0, 160, 221)",
            }
        }
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(
            title='招 聘 數 量 前 20 的 城 市 區 域',
            title_textstyle_opts=opts.TextStyleOpts(color="yellow"),
            pos_top='7%',
            pos_left='center',
        ),
        legend_opts=opts.LegendOpts(is_show=False),
        xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=-15)),
        yaxis_opts=opts.AxisOpts(
            name="",
            name_location='middle',
            name_gap=40,
            name_textstyle_opts=opts.TextStyleOpts(font_size=16),
        ),
        datazoom_opts=[opts.DataZoomOpts(range_start=1, range_end=50)],
    )
)
b1.render_notebook()
def _dark_share_pie(data_pair, title):
    """Build the dark-background pie chart used for the share charts below.

    Args:
        data_pair: list of [label, count] pairs to plot.
        title: text used both as the series name and as the chart title.

    Returns:
        The configured Pie chart (not yet rendered).
    """
    return (
        Pie(init_opts=opts.InitOpts(width="1000px", height="600px", bg_color="#2c343c"))
        .add(
            series_name=title,
            data_pair=data_pair,
            label_opts=opts.LabelOpts(is_show=False, position="center", color="rgba(255, 255, 255, 0.3)"),
        )
        .set_series_opts(
            tooltip_opts=opts.TooltipOpts(
                trigger="item", formatter="{a} <br/>{b}: {c} ({d}%)"
            ),
            label_opts=opts.LabelOpts(color="rgba(255, 255, 255, 0.3)"),
        )
        .set_global_opts(
            title_opts=opts.TitleOpts(
                title=title,
                pos_left="center",
                pos_top="20",
                title_textstyle_opts=opts.TextStyleOpts(color="#fff"),
            ),
            legend_opts=opts.LegendOpts(is_show=False),
        )
        .set_colors(["#D53A35", "#334B5C", "#61A0A8", "#D48265", "#749F83"])
    )


# Share of postings per education requirement.
boss = df['學歷'].value_counts()
x = boss.index.tolist()
y = boss.values.tolist()
data_pair = [list(z) for z in zip(x, y)]
c = _dark_share_pie(data_pair, "學歷需求占比")
c.render_notebook()

# Share of postings per experience requirement.
boss = df['經驗'].value_counts()
x = boss.index.tolist()
y = boss.values.tolist()
data_pair = [list(z) for z in zip(x, y)]
c = _dark_share_pie(data_pair, "經驗需求占比")
c.render_notebook()

# Share of postings per company field.
boss = df['公司領域'].value_counts()
x = boss.index.tolist()
y = boss.values.tolist()
data_pair = [list(z) for z in zip(x, y)]
c = _dark_share_pie(data_pair, "公司領域占比")
c.render_notebook()
from pyecharts import options as opts
from pyecharts.charts import Pie
from pyecharts.faker import Faker
# Pie chart of how often each experience requirement occurs, using a custom
# colour list and "label: count" slice labels.
boss = df['經驗'].value_counts()
x = boss.index.tolist()
y = boss.values.tolist()
data_pair = [list(z) for z in zip(x, y)]
c = (
    Pie()
    .add("", data_pair)
    .set_colors(["blue", "green", "yellow", "red", "pink", "orange", "purple"])
    .set_global_opts(title_opts=opts.TitleOpts(title="經驗要求占比"))
    .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
)
c.render_notebook()
from pyecharts import options as opts
from pyecharts.charts import Pie
from pyecharts.faker import Faker
# Ring-shaped pie of experience requirements with rich-text slice labels.
boss = df['經驗'].value_counts()
x = boss.index.tolist()
y = boss.values.tolist()
data_pair = [[label, count] for label, count in zip(x, y)]

# Named rich-text fragments referenced by the label formatter below.
rich_styles = {
    "a": {"color": "#999", "lineHeight": 22, "align": "center"},
    "abg": {
        "backgroundColor": "#e3e3e3",
        "width": "100%",
        "align": "right",
        "height": 22,
        "borderRadius": [4, 4, 0, 0],
    },
    "hr": {
        "borderColor": "#aaa",
        "width": "100%",
        "borderWidth": 0.5,
        "height": 0,
    },
    "b": {"fontSize": 16, "lineHeight": 33},
    "per": {
        "color": "#eee",
        "backgroundColor": "#334455",
        "padding": [2, 4],
        "borderRadius": 2,
    },
}
rich_label = opts.LabelOpts(
    position="outside",
    formatter="{a|{a}}{abg|}\n{hr|}\n {b|{b}: }{c} {per|{d}%} ",
    background_color="#eee",
    border_color="#aaa",
    border_width=1,
    border_radius=4,
    rich=rich_styles,
)

c = Pie()
# radius takes [inner, outer], which gives the ring shape.
c.add("", data_pair, radius=["40%", "55%"], label_opts=rich_label)
c.set_global_opts(title_opts=opts.TitleOpts(title="Pie-富文本示例"))
c.render_notebook()
# Bar chart of the ten most frequent company fields.
gsly = df['公司領域'].value_counts()[:10]
x1 = gsly.index.tolist()
y1 = gsly.values.tolist()

c = Bar()
c.add_xaxis(x1)
c.add_yaxis("公司領域", y1)
c.set_global_opts(
    title_opts=opts.TitleOpts(title="公司領域"),
    datazoom_opts=opts.DataZoomOpts(),
)
# Hide the per-bar value labels.
c.set_series_opts(label_opts=opts.LabelOpts(is_show=False))
c.render_notebook()
# Bar chart of company-size frequencies.
# NOTE(review): the slice starts at 1, so the most frequent value is left
# out -- confirm this is intended.
gsgm = df['公司規模'].value_counts()[1:10]
x2 = gsgm.index.tolist()
y2 = gsgm.values.tolist()

c = Bar()
c.add_xaxis(x2)
c.add_yaxis("公司規模", y2)
c.set_global_opts(
    title_opts=opts.TitleOpts(title="公司規模"),
    datazoom_opts=opts.DataZoomOpts(),
)
# Hide the per-bar value labels.
c.set_series_opts(label_opts=opts.LabelOpts(is_show=False))
c.render_notebook()
import stylecloud
from PIL import Image
# Word cloud of the benefits listed in the postings.
# The scraper in this file stores benefits under '福利待遇', while this code
# originally read '福利'. Keep '福利' when the column exists and fall back to
# '福利待遇' otherwise.
welfare_column = '福利' if '福利' in df.columns else '福利待遇'
welfares = df[welfare_column].dropna(how='all').values.tolist()
# Each cell is split on ',' into individual benefit tags.
welfares_list = [tag for welfare in welfares for tag in welfare.split(',')]
pic_name = '福利詞雲.png'
stylecloud.gen_stylecloud(
    text=' '.join(welfares_list),
    font_path='msyh.ttc',
    palette='cartocolors.qualitative.Bold_5',
    max_font_size=100,
    icon_name='fas fa-yen-sign',
    background_color='#212529',
    output_name=pic_name,
)
Image.open(pic_name)
好了,我的這篇文章寫到這裡就結束啦!
有更多建議或問題可以評論區或私信我哦!一起加油努力叭(ง •_•)ง
喜歡就關注一下博主,或點贊收藏評論一下我的文章叭!!!