crawler4j: a lightweight multi-threaded web crawler example
crawler4j is an open-source web crawler implemented in Java. It offers a simple, easy-to-use API with which you can set up a multi-threaded web crawler in a few minutes.
The example below combines jsoup (for HTML parsing) and javacsv to crawl rental listings from Ziroom (http://sh.ziroom.com/z/nl/).
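Before diving into the full example, here is the minimal shape of a crawler4j crawler: you extend WebCrawler and override shouldVisit and visit. This is only a sketch for orientation; MyCrawler and the example.com domain are placeholders, and the Ziroom crawler below follows the same pattern.

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.url.WebURL;

// Minimal sketch: MyCrawler and example.com are placeholders, not part of the Ziroom example.
public class MyCrawler extends WebCrawler {

    @Override
    public boolean shouldVisit(Page referringPage, WebURL url) {
        // Decide whether a discovered URL should be fetched at all.
        return url.getURL().toLowerCase().startsWith("http://www.example.com/");
    }

    @Override
    public void visit(Page page) {
        // Called after a page has been downloaded and parsed.
        if (page.getParseData() instanceof HtmlParseData) {
            HtmlParseData data = (HtmlParseData) page.getParseData();
            System.out.println(page.getWebURL().getURL() + " -> " + data.getText().length() + " chars of text");
        }
    }
}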
1. Add the Maven dependencies
<dependency>
    <groupId>edu.uci.ics</groupId>
    <artifactId>crawler4j</artifactId>
    <version>4.2</version>
</dependency>
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.8.3</version>
</dependency>
<dependency>
    <groupId>net.sourceforge.javacsv</groupId>
    <artifactId>javacsv</artifactId>
    <version>2.0</version>
</dependency>
2. Create your own crawler class extending WebCrawler

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.Set;
import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.csvreader.CsvWriter;

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.url.WebURL;

public class ZiroomCrawler extends WebCrawler {
    /** Path of the CSV file that stores the crawled listings */
    private final static String CSV_PATH = "data/crawl/ziroom.csv";
    /** File types that should not be crawled */
    private final static Pattern FILTERS = Pattern.compile(".*(\\.(css|js|bmp|gif|jpe?g|ico"
            + "|png|tiff?|mid|mp2|mp3|mp4" + "|wav|avi|mov|mpeg|ram|m4v|pdf" + "|rm|smil|wmv|swf|wma|zip|rar|gz))$");
    /** Path of the CSV file that stores the outgoing links */
    private final static String LINK_PATH = "data/crawl/link.csv";
    private static final Logger logger = LoggerFactory.getLogger(ZiroomCrawler.class);

    private final static String URL_PREFIX = "http://sh.ziroom.com/z/nl/";

    private final File csv;
    private final File csv2;
    private CsvWriter cw;
    private CsvWriter cw2;

    /** Per-crawler statistics, returned to the controller via getMyLocalData(). */
    CrawlStat myCrawlStat;

    public ZiroomCrawler() throws IOException {
        myCrawlStat = new CrawlStat();
        csv = new File(CSV_PATH);
        csv2 = new File(LINK_PATH);
        if (csv.isFile()) {
            csv.delete();
        }
        if (csv2.isFile()) {
            csv2.delete();
        }
        // Write the CSV headers once, then close; the writers are reopened in append mode later.
        cw2 = new CsvWriter(new FileWriter(csv2, true), ',');
        cw2.write("Request URL");
        cw2.endRecord();
        cw2.close();
        cw = new CsvWriter(new FileWriter(csv, true), ',');
        cw.write("Image");
        cw.write("Price");
        cw.write("Address");
        cw.write("Description");
        cw.endRecord();
        cw.close();
    }

    public void dumpMyData() {
        final int id = getMyId();
        // You can configure the logger to write to a file as well
        logger.info("Crawler {} > Processed Pages: {}", id, myCrawlStat.getTotalProcessedPages());
        logger.info("Crawler {} > Total Links Found: {}", id, myCrawlStat.getTotalLinks());
        logger.info("Crawler {} > Total Text Size: {}", id, myCrawlStat.getTotalTextSize());
    }

    @Override
    public Object getMyLocalData() {
        return myCrawlStat;
    }

    @Override
    public void onBeforeExit() {
        dumpMyData();
    }

    /*
     * This method decides which URLs get crawled. In this example only pages under
     * "http://sh.ziroom.com/z/nl/" are allowed; .css, .js and binary/media files are skipped.
     *
     * @see edu.uci.ics.crawler4j.crawler.WebCrawler#shouldVisit(edu.uci.ics.
     * crawler4j.crawler.Page, edu.uci.ics.crawler4j.url.WebURL)
     */
    @Override
    public boolean shouldVisit(Page referringPage, WebURL url) {
        final String href = url.getURL().toLowerCase();

        if (FILTERS.matcher(href).matches() || !href.startsWith(URL_PREFIX)) {
            return false;
        }
        return true;
    }

    /*
     * This method is called once a URL has been downloaded. From the Page object you can
     * easily get the URL, text, outgoing links, HTML and the unique document id.
     *
     * @see
     * edu.uci.ics.crawler4j.crawler.WebCrawler#visit(edu.uci.ics.crawler4j.
     * crawler.Page)
     */
    @Override
    public void visit(Page page) {
        final String url = page.getWebURL().getURL();
        System.out.println("-----------Visiting: " + url);
        myCrawlStat.incProcessedPages();
        if (page.getParseData() instanceof HtmlParseData) {
            final HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
            final Set<WebURL> links = htmlParseData.getOutgoingUrls();
            try {
                linkToCsv(links);
            } catch (final IOException e2) {
                e2.printStackTrace();
            }
            myCrawlStat.incTotalLinks(links.size());
            try {
                myCrawlStat.incTotalTextSize(htmlParseData.getText().getBytes("UTF-8").length);
            } catch (final UnsupportedEncodingException e1) {
                e1.printStackTrace();
            }
            final String html = htmlParseData.getHtml();

            final Document doc = Jsoup.parse(html);

            final Elements contents = doc.select("li[class=clearfix]");

            for (final Element c : contents) {
                // Image
                final String img = c.select(".img img").first().attr("src");
                System.out.println("Image: " + img);

                // Address
                final Element txt = c.select("div[class=txt]").first();
                final String arr1 = txt.select("h3 a").first().text();
                final String arr2 = txt.select("h4 a").first().text();
                final String arr3 = txt.select("div[class=detail]").first().text();

                final String arr = arr1 + "," + arr2 + "," + arr3;
                System.out.println("Address: " + arr);
                // Description
                final String rank = txt.select("p").first().text();
                System.out.println("Description: " + rank);

                // Price
                final String price = c.select("p[class=price]").first().text();

                try {
                    cw = new CsvWriter(new FileWriter(csv, true), ',');
                    cw.write(img);
                    cw.write(price);
                    cw.write(arr);
                    cw.write(rank);
                    cw.endRecord();
                    cw.flush();
                    cw.close();
                } catch (final IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    private void linkToCsv(Set<WebURL> links) throws IOException {
        cw2 = new CsvWriter(new FileWriter(csv2, true), ',');
        for (final WebURL webURL : links) {
            // Each outgoing link becomes its own row
            cw2.write(webURL.getURL());
            cw2.endRecord();
        }
        cw2.flush();
        cw2.close();
    }
}
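One caveat with this example: the controller in section 3 starts seven crawler instances, and each of them appends to the same ziroom.csv by opening a fresh CsvWriter per record. If you ever see interleaved rows, a simple alternative (a sketch only, not part of the original example; CsvRecorder is a hypothetical helper) is to funnel all writes through one shared, synchronized writer:

import java.io.FileWriter;
import java.io.IOException;

import com.csvreader.CsvWriter;

// Hypothetical helper: one shared, synchronized CsvWriter instead of reopening the file per record.
public class CsvRecorder {
    private final CsvWriter writer;

    public CsvRecorder(String path, String... header) throws IOException {
        writer = new CsvWriter(new FileWriter(path, true), ',');
        writer.writeRecord(header);
    }

    public synchronized void record(String... fields) throws IOException {
        writer.writeRecord(fields);
        writer.flush();
    }

    public synchronized void close() {
        writer.close();
    }
}

Each crawler would then call a shared recorder, e.g. recorder.record(img, price, arr, rank), instead of constructing its own CsvWriter inside visit().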

/** Per-crawler statistics, collected while crawling and aggregated by the controller. */
public class CrawlStat {
    private long totalLinks;
    private int totalProcessedPages;
    private long totalTextSize;

    public long getTotalLinks() {
        return totalLinks;
    }

    public int getTotalProcessedPages() {
        return totalProcessedPages;
    }

    public long getTotalTextSize() {
        return totalTextSize;
    }

    public void incProcessedPages() {
        this.totalProcessedPages++;
    }

    public void incTotalLinks(int count) {
        this.totalLinks += count;
    }

    public void incTotalTextSize(int count) {
        this.totalTextSize += count;
    }

    public void setTotalLinks(long totalLinks) {
        this.totalLinks = totalLinks;
    }

    public void setTotalProcessedPages(int totalProcessedPages) {
        this.totalProcessedPages = totalProcessedPages;
    }

    public void setTotalTextSize(long totalTextSize) {
        this.totalTextSize = totalTextSize;
    }
}
3. Write the controller class that runs the crawl

import java.sql.Timestamp;
import java.util.List;

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;

public class ZiroomController {
    public static void main(String[] args) {
        System.out.println("-------begin:" + new Timestamp(System.currentTimeMillis()));
        final String crawlStorageFolder = "data/crawl/root";
        final int numberOfCrawlers = 7;
        final CrawlConfig config = new CrawlConfig();
        config.setCrawlStorageFolder(crawlStorageFolder);
        config.setPolitenessDelay(1000);
        config.setIncludeBinaryContentInCrawling(false);
        config.setMaxPagesToFetch(50);
        // config.setResumableCrawling(true);
        /*
         * Instantiate the controller for this crawl.
         */
        final PageFetcher pageFetcher = new PageFetcher(config);
        final RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
        final RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
        CrawlController controller;
        try {
            controller = new CrawlController(config, pageFetcher, robotstxtServer);
            /*
             * For each crawl, you need to add some seed URLs. These are the
             * first URLs that are fetched; the crawler then follows the links
             * found in these pages.
             */
            controller.addSeed("http://sh.ziroom.com/z/nl/");
            // controller.addSeed("http://www.ziroom.com/z/nl/z3-u2.html/");
            // controller.addSeed("http://www.ics.uci.edu/~welling/");
            // controller.addSeed("http://www.ics.uci.edu/");
            /*
             * Start the crawl. This is a blocking operation, meaning that your
             * code will reach the line after this only when crawling is
             * finished.
             */
            controller.start(ZiroomCrawler.class, numberOfCrawlers);

            // Aggregate the per-crawler statistics returned by getMyLocalData()
            final List<Object> crawlersLocalData = controller.getCrawlersLocalData();
            long totalLinks = 0;
            long totalTextSize = 0;
            int totalProcessedPages = 0;
            for (final Object localData : crawlersLocalData) {
                final CrawlStat stat = (CrawlStat) localData;
                totalLinks += stat.getTotalLinks();
                totalTextSize += stat.getTotalTextSize();
                totalProcessedPages += stat.getTotalProcessedPages();
            }
            System.out.println("Aggregated Statistics:");
            System.out.println("\tProcessed Pages: " + totalProcessedPages);
            System.out.println("\tTotal Links found: " + totalLinks);
            System.out.println("\tTotal Text Size: " + totalTextSize);
        } catch (final Exception e) {
            e.printStackTrace();
        }
    }
}
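Note that controller.start(...) blocks until the whole crawl finishes. If you want the main thread to keep doing other work, crawler4j also offers a non-blocking variant; a minimal sketch of that usage, assuming the same config, pageFetcher and robotstxtServer setup as above:

// Sketch: same setup as above, but the main thread is not blocked by the crawl.
CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
controller.addSeed("http://sh.ziroom.com/z/nl/");

controller.startNonBlocking(ZiroomCrawler.class, numberOfCrawlers);
// ... the main thread is free to do other work here ...
controller.waitUntilFinish();  // block only when you finally need the results
List<Object> crawlersLocalData = controller.getCrawlersLocalData();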
I have just started blogging, so this post is for reference only. Comments and corrections are welcome!
crawler4j source repository: https://github.com/yasserg/crawler4j