程式師世界 >> 編程語言 >> C語言 >> C >> 關於C >> 用CI框架寫的抓取文章標題、內容、來源圖片代碼

用CI框架寫的抓取文章標題、內容、來源圖片代碼

編輯：關於C

整理了一下之前的代碼（原先有些亂，看不清楚）。這是我其中一個抓取頁面的代碼，在此做個備份，方便以後調用。本人能力有限，希望大家多給點意見。

/**
 * Crawl the used-car news listings on news.2sche.cn and store every article.
 *
 * For each of the four categories (stype 1..4) the method reads the page
 * count from the first listing page, walks every listing page, fetches each
 * linked article, extracts title / source / body with regular expressions,
 * downloads absolute-URL images found in the body into
 * ./static/uploads/news/, rewrites the body to point at the local copies and
 * hands the record to News_model::add().
 *
 * Output: echoes an <hr> after each listing page and the elapsed benchmark
 * time at the end.
 *
 * NOTE(review): curl_snatch() is defined elsewhere; this code assumes it
 * returns the response body as a string and something non-string or empty
 * on failure -- confirm against its implementation.
 *
 * @return void
 */
public function snatch()
{
    set_time_limit(0);
    $this->benchmark->mark('code_start');

    /* Walk the different categories of used-car news. */
    for ($i = 1; $i <= 4; $i++)
    {
        $url = 'http://news.2sche.cn/list.asp?stype='.$i;
        $result = $this->curl_snatch($url);
        if (!is_string($result) || $result === '')
        {
            // Listing could not be fetched; nothing to crawl for this category.
            continue;
        }

        // The pager is matched as "<digit>/<total></strong>"; the capture is the
        // total page count. The original compared the loop counter against the
        // whole match ARRAY, which PHP always treats as greater than an int, so
        // the page loop never terminated.
        $total_pages = 1;
        if (preg_match('/\d\/(.*?)<\/strong>/', $result, $page_news))
        {
            $total_pages = (int) trim(strip_tags($page_news[1]));
        }
        if ($total_pages < 1)
        {
            // No usable pager found: still process the page we already have.
            $total_pages = 1;
        }

        /* Walk every listing page of this category. */
        for ($j = 1; $j <= $total_pages; $j++)
        {
            if (1 == $j)
            {
                // Page 1 is the exact URL fetched above; reuse the response.
                $result_news = $result;
            }
            else
            {
                $url_news = 'http://news.2sche.cn/list.asp?page='.$j.'&stype='.$i;
                $result_news = $this->curl_snatch($url_news);
            }
            if (!is_string($result_news))
            {
                // Treat a failed fetch as an empty page so the loop carries on.
                $result_news = '';
            }

            preg_match_all('/<td width="516" height="28" class="z14"><a href="(.*?)" target="_blank">.*?<\/a><\/td>/sim', $result_news, $url_newslist);

            /* Visit every article linked from this listing page. */
            foreach ($url_newslist[1] as $url_newslists)
            {
                $url_newsinfo = 'http://news.2sche.cn/'.$url_newslists;
                $result_newsinfo = $this->curl_snatch($url_newsinfo);
                if (!is_string($result_newsinfo) || $result_newsinfo === '')
                {
                    continue;
                }

                /* Title and body are mandatory; skip the article if either is missing. */
                if (!preg_match('/<h3 class="title">(.*?)<\/strong><\/h3>/sim', $result_newsinfo, $title))
                {
                    continue;
                }
                if (!preg_match('/<td colspan="2" class="z14" style="padding-top:20px;padding-left:1px;padding-bottom:20px;line-height:25px">(.*?)<\/td>/sim', $result_newsinfo, $content))
                {
                    continue;
                }
                $body = $content[1];

                /* Source is optional; store an empty string when it is absent. */
                $source_name = '';
                if (preg_match('/<td style="BORDER-BOTTOM: #666666 1PX DASHED" width="155">【來源：(.*?) 】<\/span><\/td>/sim', $result_newsinfo, $source))
                {
                    $source_name = $source[1];
                }

                /* Collect every image URL referenced inside the body. */
                preg_match_all('/<IMG.*?src="(.*?)".*?>/sim', $body, $img);

                // Colon-terminated list of the image file names saved locally.
                $picture = '';
                // array_unique: an image used twice is downloaded and listed once.
                foreach (array_unique($img[1]) as $imgs)
                {
                    // Only absolute http(s) URLs can be downloaded as-is.
                    if (!preg_match('#^https?://#i', $imgs))
                    {
                        continue;
                    }

                    // Derive the file name from the URL path only, so query
                    // strings and directory parts never reach the filesystem.
                    $img_url_path = parse_url($imgs, PHP_URL_PATH);
                    $img_names = is_string($img_url_path) ? basename($img_url_path) : '';
                    if ($img_names === '' || $img_names === '.' || $img_names === '..')
                    {
                        continue;
                    }

                    $img_source = file_get_contents($imgs);
                    if ($img_source === false)
                    {
                        // Download failed: keep the original remote URL in the body.
                        continue;
                    }
                    if (file_put_contents("./static/uploads/news/".$img_names, $img_source) === false)
                    {
                        // Could not store the file: do not point the body at it.
                        continue;
                    }

                    $picture .= $img_names.':';

                    /* Point the article body at the local copy. */
                    $img_path = '/static/uploads/news/'.$img_names;
                    $body = str_replace($imgs, $img_path, $body);
                }

                $data = array(
                    'title' => $title[1],
                    'source' => $source_name,
                    'contents' => trim($body),
                    'picture' => $picture,
                    'style' => $i,
                    'create_time' => time(),
                );
                // A failed insert is not fatal; the crawl moves on to the next
                // article either way (same as the original behaviour).
                $this->News_model->add($data);
            }
            echo '<hr>';
        }
    }

    $this->benchmark->mark('code_end');
    echo $this->benchmark->elapsed_time('code_start', 'code_end');
}