# 初步爬蟲框架
import requests
import re
class MyCrawler:
    """Minimal crawler: download a page, extract records by regex, append them to a file.

    Attributes:
        filename: Path of the output file that ``save`` appends to.
        timeout: Seconds to wait for the server before ``download`` gives up.
    """

    # Without a timeout ``requests.get`` may block forever on a stalled server.
    timeout = 10

    def __init__(self, filename):
        """Remember the output file path; nothing is opened until ``save`` runs."""
        self.filename = filename

    def download(self, url):
        """Fetch ``url`` with an HTTP GET and return the response body as text."""
        r = requests.get(url, timeout=self.timeout)
        return r.text

    def extract(self, content, pattern):
        """Return every match of ``pattern`` in ``content``.

        The result has the shape ``re.findall`` produces: a list of strings
        when the pattern has zero or one group, a list of tuples otherwise.
        """
        return re.findall(pattern, content)

    def save(self, info):
        """Append one line per item of ``info`` to the output file.

        Tuple items are written as their first two fields separated by a
        space. Plain-string items (single-group or group-less patterns) are
        written whole instead of being indexed character by character.
        """
        # Explicit UTF-8 so non-ASCII text does not depend on the platform locale.
        with open(self.filename, 'a', encoding='utf-8') as f:
            for item in info:
                if isinstance(item, str):
                    line = item
                else:
                    line = item[0] + ' ' + item[1]
                f.write(line + '\n')

    def crawl(self, url, pattern):
        """Download ``url``, extract matches of ``pattern`` and save them."""
        content = self.download(url)
        info = self.extract(content, pattern)
        self.save(info)
# 【升級】將 User-Agent 等 headers 放入請求中
import requests
import re
class MyCrawler:
    """Crawler that sends custom request headers and stores records as UTF-8 text.

    Attributes:
        filename: Path of the output file that ``save`` appends to.
        headers: HTTP headers sent with every request made by ``download``.
        timeout: Seconds to wait for the server before ``download`` gives up.
    """

    # Without a timeout ``requests.get`` may block forever on a stalled server.
    timeout = 10

    def __init__(self, filename):
        """Remember the output path and set up the default request headers."""
        self.filename = filename
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36',
        }

    def download(self, url):
        """Fetch ``url`` with the instance headers and return the body as text."""
        r = requests.get(url, headers=self.headers, timeout=self.timeout)
        return r.text

    def extract(self, content, pattern):
        """Return every match of ``pattern`` in ``content``.

        The result has the shape ``re.findall`` produces: a list of strings
        when the pattern has zero or one group, a list of tuples otherwise.
        """
        return re.findall(pattern, content)

    def save(self, info):
        """Append one line per item of ``info`` to the output file.

        The fields of a tuple/list item are joined with ``|||``. A plain
        string item is written as is; joining it would otherwise insert the
        separator between every character.
        """
        with open(self.filename, 'a', encoding='utf-8') as f:
            for item in info:
                line = item if isinstance(item, str) else '|||'.join(item)
                f.write(line + '\n')

    def crawl(self, url, pattern, headers=None):
        """Download ``url``, extract matches of ``pattern`` and save them.

        Args:
            url: Page to fetch.
            pattern: Regular expression handed to ``extract``.
            headers: Optional extra headers. They are merged into
                ``self.headers`` and therefore stay in effect for later calls.
        """
        if headers:
            self.headers.update(headers)
        content = self.download(url)
        info = self.extract(content, pattern)
        self.save(info)
# 字典形式存儲
# 【個性化：爬取 douban 的數據】
# 將基類進行改造
class MyDoubanCrawler(MyCrawler):
    """Crawler variant that post-processes one captured field with a second regex.

    ``pattern_main`` must capture at least four groups. The group at
    ``STAR_FIELD_INDEX`` holds raw markup that is replaced by the groups
    ``pattern_star`` captures from it.
    """

    # Position of the raw star/rating markup among the groups of pattern_main.
    STAR_FIELD_INDEX = 3
    # Placeholder fields used when no star data can be extracted.
    # NOTE(review): three values presumably mirror the three groups of
    # pattern_star -- confirm against the patterns callers pass in.
    DEFAULT_STAR = ('0', '0', '0')

    def extract(self, content, pattern_main, pattern_star):
        """Return one list of fields per ``pattern_main`` match in ``content``.

        For every match the field at ``STAR_FIELD_INDEX`` is removed. If it
        contains the substring ``'allstar'`` it is searched with
        ``pattern_star`` and the first hit is appended to the remaining
        fields. Otherwise, or when ``pattern_star`` finds nothing, the
        ``DEFAULT_STAR`` placeholders are appended instead.

        Args:
            content: Text to search.
            pattern_main: Regex with at least four groups.
            pattern_star: Regex applied to the removed field.

        Returns:
            A list of lists of strings.
        """
        records = []
        for match in re.findall(pattern_main, content):
            fields = list(match)
            star_markup = fields.pop(self.STAR_FIELD_INDEX)
            hits = re.findall(pattern_star, star_markup) if 'allstar' in star_markup else []
            if not hits:
                # Either no marker or the marker is present but unparsable.
                fields.extend(self.DEFAULT_STAR)
            elif isinstance(hits[0], str):
                # Single-group pattern: keep the value whole, not per character.
                fields.append(hits[0])
            else:
                fields.extend(hits[0])
            records.append(fields)
        return records

    def crawl(self, url, pattern_main, pattern_star, headers=None):
        """Download ``url``, extract records with both patterns and save them.

        Args:
            url: Page to fetch.
            pattern_main: Regex passed to ``extract`` for the main fields.
            pattern_star: Regex passed to ``extract`` for the star field.
            headers: Optional extra headers. They are merged into
                ``self.headers`` and therefore stay in effect for later calls.
        """
        if headers:
            self.headers.update(headers)
        content = self.download(url)
        info = self.extract(content, pattern_main, pattern_star)
        self.save(info)