Python: crawling a massive sticker/meme image pack.
The corresponding installation package, installation tutorial, activation code, usage tutorial, learning materials, and tool plugins can all be collected by clicking.
If `pip` is reported as "not recognized as an internal or external command", add Python's Scripts directory to the PATH environment variable.
If installation fails with many red errors such as `read time out`, the network connection timed out — switch to a mirror source:
Tsinghua: https://pypi.tuna.tsinghua.edu.cn/simple
Aliyun: http://mirrors.aliyun.com/pypi/simple/
USTC (University of Science and Technology of China): https://pypi.mirrors.ustc.edu.cn/simple/
HUST (Huazhong University of Science and Technology): http://pypi.hustunique.com/
Shandong University of Technology: http://pypi.sdutlinux.org/
Douban: http://pypi.douban.com/simple/
For example: pip3 install -i https://pypi.doubanio.com/simple/ <module name>
If cmd reports the module is already installed (or installs it successfully) but PyCharm still cannot import it:
you may have multiple Python installations (e.g. both Anaconda and a standalone Python) — keep only one —
or the Python interpreter configured in your PyCharm project is wrong.
import requests # Data request module Third-party module pip install requests The module is not used after installation Gray shows
import parsel # Data analysis module Third-party module pip install parsel
import re # Regular expressions Built-in module No installation required
# 1. Send a request.
# Note: confirm the request URL, request method, and request headers
# (some websites also require a cookie or an anti-hotlink Referer header).
import os

# Make sure the output directory exists before any file is written.
os.makedirs('img', exist_ok=True)

# Hoisted out of the loop: the headers never change between requests.
# They disguise the Python crawler as a normal browser request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36'
}

for page in range(12, 21):
    print(f' Climbing to the top {page} Page data content ')
    url = f'https://fabiaoqing.com/biaoqing/lists/page/{page}.html'
    response = requests.get(url=url, headers=headers)
    # 2. Get the response body text returned by the server (response.text).
    # 3. Parse the data based on what the server actually returned,
    #    not on what the browser's element panel shows.
    selector = parsel.Selector(response.text)  # wrap the HTML string in a Selector
    # CSS selectors extract data by tag/attribute content.
    divs = selector.css('div.ui.segment.imghover div')
    for div in divs:
        # a::attr(title) reads the title attribute; get() returns the first match
        # or None when the div carries no such element.
        title = div.css('a::attr(title)').get()
        img_url = div.css('img::attr(data-original)').get()
        if not title or not img_url:
            # Some divs have no image/link data; skip them instead of crashing.
            continue
        # Strip characters that are invalid in file names (incl. backslash).
        title = re.sub(r'[\\/:*?"<>|\n]', '', title)
        # The last dot-separated piece of the URL is the file extension.
        ext = img_url.split('.')[-1]
        # response.content is the raw binary payload (image bytes).
        img_content = requests.get(url=img_url, headers=headers).content
        # os.path.join keeps the path portable across operating systems.
        with open(os.path.join('img', f'{title}.{ext}'), mode='wb') as f:
            f.write(img_content)
        print(title, ' Saved successfully ')
import concurrent.futures
import os
import re
import time

import parsel
import requests
def change_title(title):
    """Return *title* with characters illegal in file names replaced by '_'."""
    # \ / : * ? " < > | and newlines are not allowed in (Windows) file names.
    return re.sub(r'[\\/:*?"<>|\n]', '_', title)
def get_response(html_url):
    """Issue a GET for *html_url* with a browser-like User-Agent and return the response."""
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36',
    }
    return requests.get(url=html_url, headers=browser_headers)
def save(name, title, img_url):
    """Download *img_url* and write it to img/<title>.<name>.

    name   -- file extension taken from the URL (e.g. 'jpg')
    title  -- sanitized file stem (already passed through change_title)
    img_url -- direct URL of the image to download
    """
    # Raw binary payload of the image.
    img_content = get_response(img_url).content
    # Create the output directory on demand instead of crashing when missing.
    os.makedirs('img', exist_ok=True)
    # os.path.join keeps the path portable (the original 'img\\' was Windows-only).
    with open(os.path.join('img', f'{title}.{name}'), mode='wb') as f:
        f.write(img_content)
    print(' Saving :', title)
def main(html_url):
    """Scrape one listing page: extract every image card and save its image."""
    html_data = get_response(html_url).text
    selector = parsel.Selector(html_data)
    divs = selector.css('#container div.tagbqppdiv')
    for div in divs:
        title = div.css('img::attr(title)').get()
        img_url = div.css('img::attr(data-original)').get()
        if not title or not img_url:
            # Skip cards that lack a title or an image URL (get() returned None).
            continue
        name = img_url.split('.')[-1]  # file extension from the URL
        new_title = change_title(title)
        # Keep the file name well under the OS path-length limit.
        # (The original if/else called save() identically in both branches.)
        if len(new_title) > 255:
            new_title = new_title[:10]
        save(name, new_title, img_url)
if __name__ == '__main__':
    start_time = time.time()
    # 7 worker threads download pages concurrently (the work is I/O-bound).
    # The context manager guarantees the pool is shut down even on error.
    with concurrent.futures.ThreadPoolExecutor(max_workers=7) as exe:
        for page in range(1, 201):
            url = f'https://www.fabiaoqing.com/biaoqing/lists/page/{page}.html'
            exe.submit(main, url)
    # Truncate the difference, not each timestamp separately (the original
    # int(time.time()) - int(start_time) could be off by up to a second).
    use_time = int(time.time() - start_time)
    print(f' Total time taken :{use_time} second ')