
Python Crawler Case Analysis (I)


I. Crawling the data

# dang.py
import scrapy

class DangSpider(scrapy.Spider):
    name = 'dang'
    allowed_domains = ['category.dangdang.com/cp01.01.02.00.00.00.html']
    start_urls = ['http://category.dangdang.com/cp01.01.02.00.00.00.html']

    def parse(self, response):
        # pipelines: download the data
        # items: define the data structure
        # src = '//ul[@id="component_59"]/li//img/@src'
        # title = '//ul[@id="component_59"]/li//img/@alt'
        # price = '//ul[@id="component_59"]/li//p[@class="price"]/span[1]/text()'
        # Every selector object can call the xpath method again
        li_list = response.xpath('//ul[@id="component_59"]/li')
        for li in li_list:
            # The site lazy-loads images, so the image path is in data-original rather than src;
            # the first image has no data-original attribute, so fall back to src
            src = li.xpath('.//img/@data-original').extract_first()
            if not src:
                src = li.xpath('.//img/@src').extract_first()
            title = li.xpath('.//img/@alt').extract_first()
            price = li.xpath('.//p[@class="price"]/span[1]/text()').extract_first()
            print(src, title, price)
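
At this point the spider only prints what it extracts. It can be run from the project root with the standard Scrapy command (the project name scrapy_dangdang is taken from the imports used later in this article):

scrapy crawl dang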
# items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy

class ScrapyDangdangItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # picture URL
    src = scrapy.Field()
    # book title
    title = scrapy.Field()
    # price
    price = scrapy.Field()

II. Pipeline encapsulation

import scrapy
from scrapy_dangdang.items import ScrapyDangdangItem

class DangSpider(scrapy.Spider):
    name = 'dang'
    allowed_domains = ['category.dangdang.com/cp01.01.02.00.00.00.html']
    start_urls = ['http://category.dangdang.com/cp01.01.02.00.00.00.html']

    def parse(self, response):
        # pipelines: download the data
        # items: define the data structure
        # src = '//ul[@id="component_59"]/li//img/@src'
        # title = '//ul[@id="component_59"]/li//img/@alt'
        # price = '//ul[@id="component_59"]/li//p[@class="price"]/span[1]/text()'
        # Every selector object can call the xpath method again
        li_list = response.xpath('//ul[@id="component_59"]/li')
        for li in li_list:
            # The site lazy-loads images, so the image path is in data-original rather than src;
            # the first image has no data-original attribute, so fall back to src
            src = li.xpath('.//img/@data-original').extract_first()
            if not src:
                src = li.xpath('.//img/@src').extract_first()
            title = li.xpath('.//img/@alt').extract_first()
            price = li.xpath('.//p[@class="price"]/span[1]/text()').extract_first()
            book = ScrapyDangdangItem(src=src, title=title, price=price)
            # Once a book item is built, hand it over to the pipelines
            yield book
# pipelines.py
from itemadapter import ItemAdapter

# To use a pipeline, it must be enabled in settings.py
class ScrapyDangdangPipeline:
    # item is the book object yielded by the spider
    def process_item(self, item, spider):
        # Downloading the data this way is not recommended: the file is opened
        # once for every item, which is far too many file operations.
        # (1) write() only accepts a string, not any other object
        # (2) 'w' mode would reopen and overwrite the file for every item,
        #     so 'a' (append) mode is used here
        with open('book.json', 'a', encoding='utf-8') as fp:
            fp.write(str(item))
        return item
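
The comment above says the pipeline must be enabled in settings.py. A minimal sketch of that setting, matching the registration shown in the settings.py listing later in this article:

ITEM_PIPELINES = {
    'scrapy_dangdang.pipelines.ScrapyDangdangPipeline': 300,
}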


This approach is not recommended: the file is opened once for every item that passes through the pipeline, which results in far too many file operations.
An improved version of the case:

from itemadapter import ItemAdapter

# To use a pipeline, it must be enabled in settings.py
class ScrapyDangdangPipeline:
    # Runs once before the spider starts
    def open_spider(self, spider):
        self.fp = open('book.json', 'w', encoding='utf-8')

    # item is the book object yielded by the spider
    def process_item(self, item, spider):
        # Not recommended: opening the file for every item means too many file operations
        # (1) write() only accepts a string, not any other object
        # (2) 'w' mode opens the file for every item and overwrites the previous contents
        # with open('book.json', 'a', encoding='utf-8') as fp:
        #     fp.write(str(item))
        self.fp.write(str(item))
        return item

    # Runs once after the spider finishes
    def close_spider(self, spider):
        self.fp.close()
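
Note that str(item) does not produce valid JSON, so book.json is really just a text dump. If proper JSON lines are wanted, one possible refinement (a sketch, not part of the original case) uses the already-imported ItemAdapter together with json.dumps:

import json
from itemadapter import ItemAdapter

class ScrapyDangdangPipeline:
    def open_spider(self, spider):
        self.fp = open('book.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # ItemAdapter.asdict() turns the Item into a plain dict that json can serialize
        line = json.dumps(ItemAdapter(item).asdict(), ensure_ascii=False)
        self.fp.write(line + '\n')
        return item

    def close_spider(self, spider):
        self.fp.close()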

III. Downloading through multiple pipelines

import urllib.request

# Enabling multiple pipelines:
# (1) define the pipeline class
# (2) register it in settings.py: 'scrapy_dangdang.pipelines.DangDangDownloadPipeline': 301
class DangDangDownloadPipeline:
    def process_item(self, item, spider):
        url = 'http:' + item.get('src')
        filename = './books/' + item.get('title') + '.jpg'
        urllib.request.urlretrieve(url=url, filename=filename)
        return item
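
urllib.request.urlretrieve cannot write to ./books/ unless that directory already exists, so it has to be created beforehand. One way to do that (a sketch, not part of the original code) is to create it in open_spider:

import os
import urllib.request

class DangDangDownloadPipeline:
    def open_spider(self, spider):
        # make sure the target directory exists before any images are saved
        os.makedirs('./books/', exist_ok=True)

    def process_item(self, item, spider):
        url = 'http:' + item.get('src')
        filename = './books/' + item.get('title') + '.jpg'
        urllib.request.urlretrieve(url=url, filename=filename)
        return item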

IV. Multi-page download

1.dang.py

import scrapy
from scrapy_dangdang.items import ScrapyDangdangItem

class DangSpider(scrapy.Spider):
    name = 'dang'
    # For multi-page crawling, allowed_domains must cover every page;
    # usually only the domain name is listed
    allowed_domains = ['category.dangdang.com']
    start_urls = ['http://category.dangdang.com/cp01.01.02.00.00.00.html']
    base_url = 'http://category.dangdang.com/pg'
    page = 1

    def parse(self, response):
        # pipelines: download the data
        # items: define the data structure
        # src = '//ul[@id="component_59"]/li//img/@src'
        # title = '//ul[@id="component_59"]/li//img/@alt'
        # price = '//ul[@id="component_59"]/li//p[@class="price"]/span[1]/text()'
        # Every selector object can call the xpath method again
        li_list = response.xpath('//ul[@id="component_59"]/li')
        for li in li_list:
            # The site lazy-loads images, so the image path is in data-original rather than src;
            # the first image has no data-original attribute, so fall back to src
            src = li.xpath('.//img/@data-original').extract_first()
            if not src:
                src = li.xpath('.//img/@src').extract_first()
            title = li.xpath('.//img/@alt').extract_first()
            price = li.xpath('.//p[@class="price"]/span[1]/text()').extract_first()
            book = ScrapyDangdangItem(src=src, title=title, price=price)
            # Once a book item is built, hand it over to the pipelines
            yield book
        # The business logic of every page is the same, so just issue a new request
        # that calls parse again, e.g.:
        # http://category.dangdang.com/pg2-cp01.01.02.00.00.00.html
        # http://category.dangdang.com/pg3-cp01.01.02.00.00.00.html
        if self.page < 100:
            self.page = self.page + 1
            url = self.base_url + str(self.page) + '-cp01.01.02.00.00.00.html'
            # scrapy.Request is Scrapy's GET request: url is the address to fetch,
            # callback is the function to run on the response (passed without parentheses)
            yield scrapy.Request(url=url, callback=self.parse)
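
For reference, the same pagination step could also be written with Scrapy's response.follow, which builds the request from the current response and resolves relative URLs automatically (a sketch, not part of the original code):

# inside parse(), equivalent to the scrapy.Request call above
if self.page < 100:
    self.page += 1
    next_url = self.base_url + str(self.page) + '-cp01.01.02.00.00.00.html'
    yield response.follow(next_url, callback=self.parse)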

2.items.py

import scrapy

class ScrapyDangdangItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # picture URL
    src = scrapy.Field()
    # book title
    title = scrapy.Field()
    # price
    price = scrapy.Field()

3.settings.py

ITEM_PIPELINES = {
    # There can be many pipelines. Each pipeline has a priority in the range 1-1000;
    # the smaller the value, the higher the priority.
    'scrapy_dangdang.pipelines.ScrapyDangdangPipeline': 300,
    # DangDangDownloadPipeline
    'scrapy_dangdang.pipelines.DangDangDownloadPipeline': 301,
}

4. pipelines.py

from itemadapter import ItemAdapter

# To use a pipeline, it must be enabled in settings.py
class ScrapyDangdangPipeline:
    # Runs once before the spider starts
    def open_spider(self, spider):
        self.fp = open('book.json', 'w', encoding='utf-8')

    # item is the book object yielded by the spider
    def process_item(self, item, spider):
        # Not recommended: opening the file for every item means too many file operations
        # (1) write() only accepts a string, not any other object
        # (2) 'w' mode opens the file for every item and overwrites the previous contents
        # with open('book.json', 'a', encoding='utf-8') as fp:
        #     fp.write(str(item))
        self.fp.write(str(item))
        return item

    # Runs once after the spider finishes
    def close_spider(self, spider):
        self.fp.close()


import urllib.request

# Enabling multiple pipelines:
# (1) define the pipeline class
# (2) register it in settings.py: 'scrapy_dangdang.pipelines.DangDangDownloadPipeline': 301
class DangDangDownloadPipeline:
    def process_item(self, item, spider):
        url = 'http:' + item.get('src')
        filename = './books/' + item.get('title') + '.jpg'
        urllib.request.urlretrieve(url=url, filename=filename)
        return item


