crawler4j: a lightweight multi-threaded web crawler example
crawler4j is an open-source web crawler implemented in Java. It offers a simple, easy-to-use API with which you can set up a multi-threaded web crawler in a few minutes.
The example below combines jsoup (for HTML parsing) and javacsv to crawl rental listings from Ziroom (http://sh.ziroom.com/z/nl/).
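Before diving into the full example, here is the minimal shape of a crawler4j crawler: you extend WebCrawler and override shouldVisit and visit. This is only a sketch for orientation; MyCrawler and the example.com domain are placeholders, and the Ziroom crawler below follows the same pattern.

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.url.WebURL;

// Minimal sketch: MyCrawler and example.com are placeholders, not part of the Ziroom example.
public class MyCrawler extends WebCrawler {

    @Override
    public boolean shouldVisit(Page referringPage, WebURL url) {
        // Decide whether a discovered URL should be fetched at all.
        return url.getURL().toLowerCase().startsWith("http://www.example.com/");
    }

    @Override
    public void visit(Page page) {
        // Called after a page has been downloaded and parsed.
        if (page.getParseData() instanceof HtmlParseData) {
            HtmlParseData data = (HtmlParseData) page.getParseData();
            System.out.println(page.getWebURL().getURL() + " -> " + data.getText().length() + " chars of text");
        }
    }
}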
1. Add the Maven dependencies
<dependency>
    <groupId>edu.uci.ics</groupId>
    <artifactId>crawler4j</artifactId>
    <version>4.2</version>
</dependency>
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.8.3</version>
</dependency>
<dependency>
    <groupId>net.sourceforge.javacsv</groupId>
    <artifactId>javacsv</artifactId>
    <version>2.0</version>
</dependency>
2. Create your own crawler class extending WebCrawler

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.Set;
import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.csvreader.CsvWriter;

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.url.WebURL;

public class ZiroomCrawler extends WebCrawler {
    /** Path of the CSV file that stores the crawled listings */
    private final static String CSV_PATH = "data/crawl/ziroom.csv";
    /** File types that should not be crawled */
    private final static Pattern FILTERS = Pattern.compile(".*(\\.(css|js|bmp|gif|jpe?g|ico"
            + "|png|tiff?|mid|mp2|mp3|mp4" + "|wav|avi|mov|mpeg|ram|m4v|pdf" + "|rm|smil|wmv|swf|wma|zip|rar|gz))$");
    /** Path of the CSV file that stores the outgoing links */
    private final static String LINK_PATH = "data/crawl/link.csv";
    private static final Logger logger = LoggerFactory.getLogger(ZiroomCrawler.class);

    private final static String URL_PREFIX = "http://sh.ziroom.com/z/nl/";

    private final File csv;
    private final File csv2;
    private CsvWriter cw;
    private CsvWriter cw2;

    /** Per-crawler statistics, returned to the controller via getMyLocalData(). */
    CrawlStat myCrawlStat;

    public ZiroomCrawler() throws IOException {
        myCrawlStat = new CrawlStat();
        csv = new File(CSV_PATH);
        csv2 = new File(LINK_PATH);
        if (csv.isFile()) {
            csv.delete();
        }
        if (csv2.isFile()) {
            csv2.delete();
        }
        // Write the CSV headers once, then close; the writers are reopened in append mode later.
        cw2 = new CsvWriter(new FileWriter(csv2, true), ',');
        cw2.write("Request URL");
        cw2.endRecord();
        cw2.close();
        cw = new CsvWriter(new FileWriter(csv, true), ',');
        cw.write("Image");
        cw.write("Price");
        cw.write("Address");
        cw.write("Description");
        cw.endRecord();
        cw.close();
    }

    public void dumpMyData() {
        final int id = getMyId();
        // You can configure the logger to write to a file as well
        logger.info("Crawler {} > Processed Pages: {}", id, myCrawlStat.getTotalProcessedPages());
        logger.info("Crawler {} > Total Links Found: {}", id, myCrawlStat.getTotalLinks());
        logger.info("Crawler {} > Total Text Size: {}", id, myCrawlStat.getTotalTextSize());
    }

    @Override
    public Object getMyLocalData() {
        return myCrawlStat;
    }

    @Override
    public void onBeforeExit() {
        dumpMyData();
    }

    /*
     * This method decides which URLs get crawled. In this example only pages under
     * "http://sh.ziroom.com/z/nl/" are allowed; .css, .js and binary/media files are skipped.
     *
     * @see edu.uci.ics.crawler4j.crawler.WebCrawler#shouldVisit(edu.uci.ics.
     * crawler4j.crawler.Page, edu.uci.ics.crawler4j.url.WebURL)
     */
    @Override
    public boolean shouldVisit(Page referringPage, WebURL url) {
        final String href = url.getURL().toLowerCase();

        if (FILTERS.matcher(href).matches() || !href.startsWith(URL_PREFIX)) {
            return false;
        }
        return true;
    }

    /*
     * This method is called once a URL has been downloaded. From the Page object you can
     * easily get the URL, text, outgoing links, HTML and the unique document id.
     *
     * @see
     * edu.uci.ics.crawler4j.crawler.WebCrawler#visit(edu.uci.ics.crawler4j.
     * crawler.Page)
     */
    @Override
    public void visit(Page page) {
        final String url = page.getWebURL().getURL();
        System.out.println("-----------Visiting: " + url);
        myCrawlStat.incProcessedPages();
        if (page.getParseData() instanceof HtmlParseData) {
            final HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
            final Set<WebURL> links = htmlParseData.getOutgoingUrls();
            try {
                linkToCsv(links);
            } catch (final IOException e2) {
                e2.printStackTrace();
            }
            myCrawlStat.incTotalLinks(links.size());
            try {
                myCrawlStat.incTotalTextSize(htmlParseData.getText().getBytes("UTF-8").length);
            } catch (final UnsupportedEncodingException e1) {
                e1.printStackTrace();
            }
            final String html = htmlParseData.getHtml();

            final Document doc = Jsoup.parse(html);

            final Elements contents = doc.select("li[class=clearfix]");

            for (final Element c : contents) {
                // Image
                final String img = c.select(".img img").first().attr("src");
                System.out.println("Image: " + img);

                // Address
                final Element txt = c.select("div[class=txt]").first();
                final String arr1 = txt.select("h3 a").first().text();
                final String arr2 = txt.select("h4 a").first().text();
                final String arr3 = txt.select("div[class=detail]").first().text();

                final String arr = arr1 + "," + arr2 + "," + arr3;
                System.out.println("Address: " + arr);
                // Description
                final String rank = txt.select("p").first().text();
                System.out.println("Description: " + rank);

                // Price
                final String price = c.select("p[class=price]").first().text();

                try {
                    cw = new CsvWriter(new FileWriter(csv, true), ',');
                    cw.write(img);
                    cw.write(price);
                    cw.write(arr);
                    cw.write(rank);
                    cw.endRecord();
                    cw.flush();
                    cw.close();
                } catch (final IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    private void linkToCsv(Set<WebURL> links) throws IOException {
        cw2 = new CsvWriter(new FileWriter(csv2, true), ',');
        for (final WebURL webURL : links) {
            // Each outgoing link becomes its own row
            cw2.write(webURL.getURL());
            cw2.endRecord();
        }
        cw2.flush();
        cw2.close();
    }
}
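One caveat with this example: the controller in section 3 starts seven crawler instances, and each of them appends to the same ziroom.csv by opening a fresh CsvWriter per record. If you ever see interleaved rows, a simple alternative (a sketch only, not part of the original example; CsvRecorder is a hypothetical helper) is to funnel all writes through one shared, synchronized writer:

import java.io.FileWriter;
import java.io.IOException;

import com.csvreader.CsvWriter;

// Hypothetical helper: one shared, synchronized CsvWriter instead of reopening the file per record.
public class CsvRecorder {
    private final CsvWriter writer;

    public CsvRecorder(String path, String... header) throws IOException {
        writer = new CsvWriter(new FileWriter(path, true), ',');
        writer.writeRecord(header);
    }

    public synchronized void record(String... fields) throws IOException {
        writer.writeRecord(fields);
        writer.flush();
    }

    public synchronized void close() {
        writer.close();
    }
}

Each crawler would then call a shared recorder, e.g. recorder.record(img, price, arr, rank), instead of constructing its own CsvWriter inside visit().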

/** Per-crawler statistics, collected while crawling and aggregated by the controller. */
public class CrawlStat {
    private long totalLinks;
    private int totalProcessedPages;
    private long totalTextSize;

    public long getTotalLinks() {
        return totalLinks;
    }

    public int getTotalProcessedPages() {
        return totalProcessedPages;
    }

    public long getTotalTextSize() {
        return totalTextSize;
    }

    public void incProcessedPages() {
        this.totalProcessedPages++;
    }

    public void incTotalLinks(int count) {
        this.totalLinks += count;
    }

    public void incTotalTextSize(int count) {
        this.totalTextSize += count;
    }

    public void setTotalLinks(long totalLinks) {
        this.totalLinks = totalLinks;
    }

    public void setTotalProcessedPages(int totalProcessedPages) {
        this.totalProcessedPages = totalProcessedPages;
    }

    public void setTotalTextSize(long totalTextSize) {
        this.totalTextSize = totalTextSize;
    }
}
3. Write the controller class that runs the crawl

import java.sql.Timestamp;
import java.util.List;

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;

public class ZiroomController {
    public static void main(String[] args) {
        System.out.println("-------begin:" + new Timestamp(System.currentTimeMillis()));
        final String crawlStorageFolder = "data/crawl/root";
        final int numberOfCrawlers = 7;
        final CrawlConfig config = new CrawlConfig();
        config.setCrawlStorageFolder(crawlStorageFolder);
        config.setPolitenessDelay(1000);
        config.setIncludeBinaryContentInCrawling(false);
        config.setMaxPagesToFetch(50);
        // config.setResumableCrawling(true);
        /*
         * Instantiate the controller for this crawl.
         */
        final PageFetcher pageFetcher = new PageFetcher(config);
        final RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
        final RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
        CrawlController controller;
        try {
            controller = new CrawlController(config, pageFetcher, robotstxtServer);
            /*
             * For each crawl, you need to add some seed URLs. These are the
             * first URLs that are fetched; the crawler then follows the links
             * found in these pages.
             */
            controller.addSeed("http://sh.ziroom.com/z/nl/");
            // controller.addSeed("http://www.ziroom.com/z/nl/z3-u2.html/");
            // controller.addSeed("http://www.ics.uci.edu/~welling/");
            // controller.addSeed("http://www.ics.uci.edu/");
            /*
             * Start the crawl. This is a blocking operation, meaning that your
             * code will reach the line after this only when crawling is
             * finished.
             */
            controller.start(ZiroomCrawler.class, numberOfCrawlers);

            // Aggregate the per-crawler statistics returned by getMyLocalData()
            final List<Object> crawlersLocalData = controller.getCrawlersLocalData();
            long totalLinks = 0;
            long totalTextSize = 0;
            int totalProcessedPages = 0;
            for (final Object localData : crawlersLocalData) {
                final CrawlStat stat = (CrawlStat) localData;
                totalLinks += stat.getTotalLinks();
                totalTextSize += stat.getTotalTextSize();
                totalProcessedPages += stat.getTotalProcessedPages();
            }
            System.out.println("Aggregated Statistics:");
            System.out.println("\tProcessed Pages: " + totalProcessedPages);
            System.out.println("\tTotal Links found: " + totalLinks);
            System.out.println("\tTotal Text Size: " + totalTextSize);
        } catch (final Exception e) {
            e.printStackTrace();
        }
    }
}
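Note that controller.start(...) blocks until the whole crawl finishes. If you want the main thread to keep doing other work, crawler4j also offers a non-blocking variant; a minimal sketch of that usage, assuming the same config, pageFetcher and robotstxtServer setup as above:

// Sketch: same setup as above, but the main thread is not blocked by the crawl.
CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
controller.addSeed("http://sh.ziroom.com/z/nl/");

controller.startNonBlocking(ZiroomCrawler.class, numberOfCrawlers);
// ... the main thread is free to do other work here ...
controller.waitUntilFinish();  // block only when you finally need the results
List<Object> crawlersLocalData = controller.getCrawlersLocalData();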
I have just started blogging, so this post is for reference only. Comments and corrections are welcome!
crawler4j source repository: https://github.com/yasserg/crawler4j