玩玩Hibernate(二)hibernate-spider爬蟲~~,spider爬蟲
新建一個hSpider的工程,引入前面已經建立的lib
並為其建立一個hibernate.cfg.xml的配置文件
1 <?xml version='1.0' encoding='utf-8'?>
2 <!DOCTYPE hibernate-configuration PUBLIC
3 "-//Hibernate/Hibernate Configuration DTD 3.0//EN"
4 "http://hibernate.sourceforge.net/hibernate-configuration-3.0.dtd">
5
6 <hibernate-configuration>
7
8 <session-factory>
9
10 <!-- Database connection settings: JDBC driver class, URL, user name and password -->
11 <property name="connection.driver_class">com.mysql.jdbc.Driver</property>
12 <property name="connection.url">jdbc:mysql://localhost:3306/hSpider</property>
13 <property name="connection.username">root</property>
14 <property name="connection.password"></property>
15
16 <!-- JDBC connection pool (Hibernate's built-in pool); not used for now -->
17 <!-- <property name="connection.pool_size">1</property> -->
18
19 <!-- SQL dialect: this example uses MySQL -->
20 <property name="dialect">org.hibernate.dialect.MySQLDialect</property>
21
22 <!-- Enable Hibernate's automatic session context management (newer feature; not used for now) -->
23 <!-- <property name="current_session_context_class">thread</property> -->
24
25 <!-- Disable the second-level cache (left alone in this example) -->
26 <property name="cache.provider_class">org.hibernate.cache.NoCacheProvider</property>
27
28 <!-- Echo all executed SQL to stdout: show_sql=true makes Hibernate print the SQL it generates to the console -->
29 <property name="show_sql">true</property>
30
31 <!-- Automatic schema generation: controls whether Hibernate creates the tables for us (left commented out here) -->
32 <!-- <property name="hbm2ddl.auto">update</property> -->
33
34 <mapping resource="hibernateSpider/edNews.hbm.xml"/> <!-- Declares the mapping file that Hibernate has to load -->
35
36 </session-factory>
37
38 </hibernate-configuration>
新建`hibernateSpider`包:依次點擊打開HibernateSpider->右鍵src->New->Package
新建`edNews`類:依次點擊打開HibernateSpider->src->hibernateSpider->New->Class
![](https://www.aspphp.online/bianchen/UploadFiles_4619/201701/2017011815344208.gif)
![]()
/**
 * Persistent entity holding one scraped announcement.
 *
 * <p>Exposes two bean properties: {@code id} and {@code news} (backed by the
 * {@code ednews} field).
 */
public class edNews {
    /** Identifier of the row. */
    private int id;
    /** Announcement text, exposed as the {@code news} property. */
    private String ednews;

    /**
     * Returns the identifier.
     *
     * @return the current id
     */
    public int getId() {
        return id;
    }

    /**
     * Sets the identifier.
     *
     * @param id the id to store
     */
    public void setId(int id) {
        this.id = id;
    }

    /**
     * Returns the announcement text.
     *
     * @return the stored text, possibly {@code null}
     */
    public String getNews() {
        return ednews;
    }

    /**
     * Sets the announcement text directly.
     *
     * <p>This overload has the same type as {@link #getNews()}, so code that
     * accesses the {@code news} property through its getter/setter pair
     * (for example a persistence framework populating the object from a
     * string column) has a matching setter to call.
     *
     * @param ednews the announcement text
     */
    public void setNews(String ednews) {
        this.ednews = ednews;
    }

    /**
     * Copies the announcement text out of a scraped {@code news} object.
     *
     * @param ednews the scraped item; {@code null} clears the stored text
     */
    public void setNews(news ednews) {
        this.ednews = (ednews == null) ? null : ednews.ednews;
    }
}
edNews
並為其新建一個edNews.hbm.xml映射文件(必須跟edNews在同一個包中)
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE hibernate-mapping PUBLIC
"-//Hibernate/Hibernate Mapping DTD 3.0//EN"
"http://hibernate.sourceforge.net/hibernate-mapping-3.0.dtd">
<hibernate-mapping>
<!-- Maps the hibernateSpider.edNews class to the "ednews" table. -->
<class name="hibernateSpider.edNews" table="ednews">
<!-- Identifier: the "id" property is stored in the "id" column; values are produced by the "increment" generator. -->
<id name="id" type="int">
<column name="id" />
<generator class="increment" />
</id>
<!-- The "news" property is stored as a string in the "news" column, declared with length 255. -->
<property name="news" type="string">
<column name="news" length="255" />
</property>
</class>
</hibernate-mapping>
新建一個news類(用於顯示)
![](https://www.aspphp.online/bianchen/UploadFiles_4619/201701/2017011815344208.gif)
![]()
/**
 * Simple holder for one scraped announcement, used for display.
 */
public class news {

    /** Announcement text; starts out as the empty string. */
    public String ednews;

    /** Creates an item whose text is the empty string. */
    public news() {
        ednews = "";
    }

    /**
     * Renders the item for printing.
     *
     * @return the announcement text prefixed with the label and followed by a newline
     */
    @Override
    public String toString() {
        return "公告:" + ednews + "\n";
    }
}
News
新建一個Spider類,這個是爬蟲代碼的實現
![](https://www.aspphp.online/bianchen/UploadFiles_4619/201701/2017011815344208.gif)
![]()
1 package hibernateSpider;
2
3 import java.io.BufferedReader;
4 import java.io.InputStreamReader;
5 import java.net.URL;
6 import java.net.URLConnection;
7 import java.util.ArrayList;
8 import java.util.regex.Matcher;
9 import java.util.regex.Pattern;
10
11 public class Spider {
12 public static String SendGet(String url) {
13 // 定義一個字符串用來存儲網頁內容
14 String result = "";
15 // 定義一個緩沖字符輸入流
16 BufferedReader in = null;
17
18 try {
19 // 將string轉成url對象
20 URL realUrl = new URL(url);
21 // 初始化一個鏈接到那個url的連接
22 URLConnection connection = realUrl.openConnection();
23 // 開始實際的連接
24 connection.connect();
25 // 初始化 BufferedReader輸入流來讀取URL的響應
26 in = new BufferedReader(new InputStreamReader(
27 connection.getInputStream(), "UTF-8"));
28 // 用來臨時存儲抓取到的每一行的數據
29 String line;
30 while ((line = in.readLine()) != null) {
31 // 遍歷抓取到的每一行並將其存儲到result裡面
32 result += line;
33 }
34 } catch (Exception e) {
35 System.out.println("發送GET請求出現異常!" + e);
36 e.printStackTrace();
37 }
38 // 使用finally來關閉輸入流
39 finally {
40 try {
41 if (in != null) {
42 in.close();
43 }
44 } catch (Exception e2) {
45 e2.printStackTrace();
46 }
47 }
48 return result;
49
50 }
51
52 public static ArrayList<news> GetNews(String content) {
53 // 預定義一個ArrayList來存儲結果
54 ArrayList<news> results = new ArrayList<news>();
55 // 用來匹配標題
56 Pattern questionPattern = Pattern.compile("ggtz/\\d{4}.shtml.+?>(.+?)<");
57 Matcher questionMatcher = questionPattern.matcher(content);
58 // 用來匹配url,也就是問題的鏈接
59 Pattern urlPattern = Pattern.compile("ggtz/\\d{4}.shtml.+?>(.+?)<");
60 Matcher urlMatcher = urlPattern.matcher(content);
61
62 // 問題和鏈接要均能匹配到
63 boolean isFind = questionMatcher.find() && urlMatcher.find();
64
65 while (isFind) {
66 // 定義一個news對象(公告對象)來存儲抓取到的信息
67 news newsTemp = new news();
68 newsTemp.ednews= questionMatcher.group(1);
69
70
71 // 添加成功匹配的結果
72 results.add(newsTemp);
73 // 繼續查找下一個匹配對象
74 isFind = questionMatcher.find() && urlMatcher.find();
75 }
76 return results;
77 }
78
79
80 }
Spider
最後,測試一下結果
![](https://www.aspphp.online/bianchen/UploadFiles_4619/201701/2017011815344208.gif)
![]()
1 public class MainTest {
2
3
4 public static void main(String[] args) {
5
6 // 定義即將訪問的鏈接
7
8 String url = "http://jwc.gdut.edu.cn/";
9 // 訪問鏈接並獲取頁面內容
10 String content = Spider.SendGet(url);
11 // 獲取該頁面的所有的命題對象
12 ArrayList<news> myNews = Spider. GetNews(content);
13 // 打印結果
14 for(int i = 0; i < myNews.size(); i++){
15 System.out.println(myNews.get(i));
16
17 edNews aNew = new edNews() ;//新建我們需要存儲的類對象,並且設置其對象的一些屬性
18 aNew.setId(i);
19 aNew.setNews(myNews.get(i));
20
21 {
22 //Configuration主要用以讀取配置文件
23 Configuration cfg = new Configuration();
24 SessionFactory sf = cfg.configure().buildSessionFactory();
25 //buildSessionFactory();得到一個創建Session的工場
26 Session ss = sf.openSession();
27 ss.beginTransaction();//OK,將操作放入事務中
28 ss.save(aNew);//保存你的對象
29 ss.getTransaction().commit();//得到事務並提交
30
31 ss.close();//Session關閉
32 sf.close();//工廠關閉
33
34
35 }
36 }
37 }
38 }
MainTest
![](https://www.aspphp.online/bianchen/UploadFiles_4619/201701/2017011815344227.png)
![](https://www.aspphp.online/bianchen/UploadFiles_4619/201701/2017011815344865.png)