爬蟲初學者一枚,用 Java 的 WebMagic 框架抓取百度搜索結果,利用 Servlet 傳入要搜索的關鍵字,已做好解析和相關的流程調度工作。
現在要爬取百度前 10 頁返回的信息,每頁有 10 條,期望返回 100 條信息,但是每次只返回 90 條左右,不知道為什麼會有鏈接丟失。望大神解答,關鍵代碼如下:
/**
 * Dispatches a downloaded page according to its request URL.
 *
 * <ul>
 *   <li>URL ending with {@code &pn=0&ie=utf8}: stores the {@code "name"} request extra in the
 *       {@code name} field, enqueues up to nine pagination links found under
 *       {@code div#page}, then enqueues the result links of the page.</li>
 *   <li>URL starting with {@code http://www.baidu.com/s?wd=}: enqueues the result links.</li>
 *   <li>URL starting with {@code https://www.baidu.com/link}: enqueues the address found between
 *       {@code URL='} and {@code '} in the page source (presumably the redirect target).</li>
 *   <li>Anything else: handed to the {@code Baidu_FilterUtils} checks.</li>
 * </ul>
 *
 * @param page the page delivered by the crawler
 * @throws MalformedURLException kept from the original signature; presumably raised by the
 *         {@code Baidu_FilterUtils} calls — confirm
 */
public void processWithException(Page page) throws MalformedURLException {
    final int maxPaginationLinks = 9;
    Html html = page.getHtml();
    String requestUrl = page.getRequest().getUrl();

    if (requestUrl.endsWith("&pn=0&ie=utf8")) {
        name = (String) page.getRequest().getExtra("name");

        // Evaluate the pagination XPath once and never read past the end of the list:
        // the previous fixed loop of 9 failed when only 5-8 links were present.
        List<String> paginationLinks = html.xpath("//div[@id='page']/a/@href").all();
        int limit = Math.min(maxPaginationLinks, paginationLinks.size());
        for (int i = 0; i < limit; i++) {
            page.addTargetRequest(paginationLinks.get(i));
        }

        enqueueResultLinks(page, html);
    } else if (requestUrl.startsWith("http://www.baidu.com/s?wd=")) {
        // NOTE(review): only the http:// form is recognised here; an https:// result page would
        // fall through to the filter branch below — confirm which scheme the crawler receives.
        enqueueResultLinks(page, html);
    } else if (requestUrl.startsWith("https://www.baidu.com/link")) {
        String realUrl = StringUtils.substringBetween(html.toString(), "URL=\'", "\'");
        if (realUrl != null) {
            page.addTargetRequest(realUrl);
        }
    } else {
        // Reaching this point already implies the URL matched none of the prefixes above.
        Baidu_FilterUtils baiduFilterUtils = new Baidu_FilterUtils();
        baiduFilterUtils.BaiduPassKeyWord(page, name, programList);
        baiduFilterUtils.BaiduContainWebsites(page, name, programList_websites);
        baiduFilterUtils.BaiduContainDownload(page, name, programList_download);
        baiduFilterUtils.BaiduContainsPass(page, name, programList_pass);
    }
}

/**
 * Enqueues every result link of a search page, switched to https and suffixed with
 * {@code &wd=&eqid=<eqid>}, where the eqid is read from the {@code bds.comm.eqid} assignment
 * in the page source.
 */
private void enqueueResultLinks(Page page, Html html) {
    String eqid = StringUtils.substringBetween(html.toString(), "bds.comm.eqid = \"", "\";");
    // Avoid sending the literal text "null" when the eqid marker is absent.
    String suffix = "&wd=&eqid=" + (eqid == null ? "" : eqid);

    // NOTE(review): if the XPath engine treats @class='c-container' as exact equality, result
    // containers carrying additional classes are not matched, which could explain missing
    // results — confirm against the actual page markup.
    List<String> resultLinks = html.xpath("//div[@class='c-container']/h3/a/@href").all();
    for (String link : resultLinks) {
        String secureLink = toHttps(link);
        if (secureLink != null) {
            page.addTargetRequest(secureLink + suffix);
        }
    }
}

/**
 * Rewrites only a leading {@code http://} scheme to {@code https://}. A blanket
 * {@code replace("http", "https")} would turn {@code https://} into {@code httpss://} and
 * would also alter any "http" occurring later in the URL.
 */
private static String toHttps(String url) {
    final String insecurePrefix = "http://";
    if (url != null && url.startsWith(insecurePrefix)) {
        return "https://" + url.substring(insecurePrefix.length());
    }
    return url;
}
http://ask.seowhy.com/question/16013