最近在做淘寶U站,因為參加一個比賽,每天看自己的票數太麻煩,乾脆寫了一個抓取程序。下面的程序是抓取的代碼:
# -*- coding: utf-8 -*-
#!/usr/bin/python
# Filename : uz.py
import re
import urllib2,string
import thread, time
import sys
# Python 2 only: reload the sys module and switch the interpreter-wide default
# string encoding to UTF-8. Presumably needed so that the mixed byte-string /
# unicode concatenations in the print statements further down do not raise
# UnicodeDecodeError -- confirm before removing.
reload(sys)
sys.setdefaultencoding('utf-8')
class Spider(object):
    """Scrape the Taobao vote listing, one result page at a time.

    Every parsed entry is appended to the module-level ``Allitems`` list as
    ``[url, name, votes]``, and page loads are serialized through the
    module-level ``lock``. Both globals must exist before ``Start`` is called.
    """

    def __init__(self):
        super(Spider, self).__init__()
        # NOTE: "enale" (sic) is never read inside this class; the spelling is
        # kept so any outside code touching the attribute keeps working.
        self.enale = True
        self.page = 1
        # The pasted source read "¤tRand": that is "&currentRand" after an
        # HTML renderer swallowed "&curren" as the currency-sign entity.
        self.url = 'http://display.taobao.com/chengling/voteusitemr.htm?perPageSize=12&currentRand=95&toPage='
        self.post = ''

    def getPage(self, num):
        """Download result page ``num`` and record the entries found on it.

        Each entry is stored as ``[url, name, votes]`` with newlines removed
        and appended to the global ``Allitems`` list.

        Returns:
            The list of entries parsed from this page. (Previously nothing
            was returned, so ``self.post`` always ended up as ``None``.)
        """
        global Allitems
        url = self.url + str(num)
        response = urllib2.urlopen(url)
        try:
            responsePage = response.read()
        finally:
            # Always release the HTTP connection, even if the read fails.
            response.close()
        unicodePage = responsePage.decode('utf-8')
        # NOTE(review): this pattern arrived with its HTML tags stripped and
        # the literal broken across two lines. As written it has a single
        # capture group, so findall() yields plain strings and the item[0..2]
        # indexing below does not select url/name/votes. Restore the original
        # three-group pattern from the page markup before trusting the output.
        myItems = re.findall('\n- .*?.*?.*?.*?.*?(.*?).*?.*?.*?', unicodePage, re.S)
        # Each match is expected to be (url, name, vote count).
        items = [
            [item[0].replace("\n", ""), item[1].replace("\n", ""), item[2].replace("\n", "")]
            for item in myItems
        ]
        Allitems.extend(items)
        return items

    def loadPage(self, num):
        """Thread entry point: fetch page ``num`` while holding the global lock."""
        global lock
        # "with" releases the lock even when the download or parse raises; a
        # bare acquire()/release() pair would leave it held and block every
        # page that is loaded afterwards.
        with lock:
            print(u"正在加載第%d頁內容請稍候..." % num)
            self.post = self.getPage(num)

    def Start(self):
        """Spawn one loader thread per page for pages 1-28, one second apart.

        The threads are not joined; this method returns after the sleep that
        follows the last thread start.
        """
        for x in xrange(1, 29):
            thread.start_new_thread(self.loadPage, (x,))
            time.sleep(1)
# Driver: scrape every page, rank the entries by vote count, and print those
# with at least 100 votes.
print(u'開始統計:')
Allitems = []  # shared result list; Spider.getPage appends [url, name, votes]
lock = thread.allocate_lock()  # lock object acquired by Spider.loadPage
# Lowercase instance name so the Spider class itself is not overwritten.
spider = Spider()
spider.Start()
print(u'正在排序請稍等...')
# Grace period for the last loader thread; the threads are never joined, so a
# slow final page can still be missing from the results.
time.sleep(1)
# Vote counts are strings: compare them as integers so that "1000" outranks
# "999" instead of sorting lexicographically.
Allitems = sorted(Allitems, key=lambda entry: int(entry[2]), reverse=True)
# (To save the ranking to a file such as uz.html, write the same line there.)
key = 1
for item in Allitems:
    if int(item[2]) >= 100:
        print(u'第'+str(key)+'名:'+item[2]+'票--'+item[1]+item[0])
        key += 1
print(len(Allitems))
print(u'按回車退出')
raw_input(' ')