整理了一下:之前的版本有些亂,看不清楚。這是我其中一個抓取頁面的代碼,在此做一個備份,方便以後調用。本人能力有限,希望大家多給點意見。
/**
 * Crawl the used-car news on news.2sche.cn and store each article through News_model.
 *
 * For every news category (stype 1 to 4) the first list page is fetched to
 * read the total number of list pages. Each list page is then scanned for
 * article links, each article is downloaded, and its title, source and body
 * are extracted with regular expressions. Images referenced by an absolute
 * http(s) URL are copied to ./static/uploads/news/ and the body is rewritten
 * to point at the local copy. An '<hr>' is echoed after each list page and
 * the elapsed benchmark time is echoed at the end.
 *
 * @return void
 */
public function snatch()
{
    set_time_limit(0);
    $this->benchmark->mark('code_start');

    // Only files with these extensions are written to the web-served upload
    // directory, because the file name comes from remote (untrusted) HTML.
    $allowed_ext = array('jpg', 'jpeg', 'png', 'gif', 'bmp', 'webp');

    /* Walk every news category */
    for ($i = 1; $i <= 4; $i++)
    {
        $url = 'http://news.2sche.cn/list.asp?stype='.$i;
        $result = $this->curl_snatch($url);

        // The pager is rendered as "<strong>current/total</strong>".
        // preg_match_all() fills $page_news[1] with an ARRAY of captures, so the
        // total has to be read from its first element; comparing the loop counter
        // with the array itself is always true and never terminates.
        preg_match_all('/<strong>\d\/(.*?)<\/strong>/', $result, $page_news);
        $page_total = isset($page_news[1][0]) ? (int) $page_news[1][0] : 0;
        if ($page_total < 1)
        {
            // No pager found: still process the list page that was fetched.
            $page_total = 1;
        }

        /* Walk every list page of this category */
        for ($j = 1; $j <= $page_total; $j++)
        {
            if (1 === $j)
            {
                // Page 1 is the page that was just downloaded for the pager.
                $result_news = $result;
            }
            else
            {
                $url_news = 'http://news.2sche.cn/list.asp?page='.$j.'&stype='.$i;
                $result_news = $this->curl_snatch($url_news);
            }

            preg_match_all('/<td width="516" height="28" class="z14"><a href="(.*?)" target="_blank">.*?<\/a><\/td>/sim', $result_news, $url_newslist);

            /* Visit every article linked from the list page */
            foreach ($url_newslist[1] as $url_newslists)
            {
                $url_newsinfo = 'http://news.2sche.cn/'.$url_newslists;
                $result_newsinfo = $this->curl_snatch($url_newsinfo);

                /* Title and body are mandatory; skip the article when either is missing */
                if ( ! preg_match('/<h3 class="title"><strong>(.*?)<\/strong><\/h3>/sim', $result_newsinfo, $title))
                {
                    continue;
                }
                if ( ! preg_match('/<td colspan="2" class="z14" style="padding-top:20px;padding-left:1px;padding-bottom:20px;line-height:25px">(.*?)<\/td>/sim', $result_newsinfo, $content))
                {
                    continue;
                }
                $body = $content[1];

                /* Source is optional */
                $source_name = '';
                if (preg_match('/<td style="BORDER-BOTTOM: #666666 1PX DASHED" width="155"><span class="right">【來源:(.*?) 】<\/span><\/td>/sim', $result_newsinfo, $source))
                {
                    $source_name = $source[1];
                }

                /* Collect every image URL inside the body */
                preg_match_all('/<IMG.*?src="(.*?)".*?>/sim', $body, $img);

                $picture = '';
                foreach ($img[1] as $imgs)
                {
                    // Only absolute http(s) URLs can be downloaded.
                    if ( ! preg_match('#^https?://#i', $imgs))
                    {
                        continue;
                    }

                    // File name = last segment of the URL path (query string excluded).
                    $img_names = basename((string) parse_url($imgs, PHP_URL_PATH));
                    $img_ext = strtolower(pathinfo($img_names, PATHINFO_EXTENSION));
                    if ( ! in_array($img_ext, $allowed_ext, true))
                    {
                        continue;
                    }

                    $img_source = file_get_contents($imgs);
                    if ($img_source === false)
                    {
                        // Download failed: keep the remote URL in the body.
                        continue;
                    }
                    if (file_put_contents("./static/uploads/news/".$img_names, $img_source) === false)
                    {
                        // Could not store the file: keep the remote URL in the body.
                        continue;
                    }

                    $picture .= $img_names.':';

                    /* Point the body at the local copy */
                    $img_path = '/static/uploads/news/'.$img_names;
                    $body = str_replace($imgs, $img_path, $body);
                }

                $data = array(
                    'title' => $title[1],
                    'source' => $source_name,
                    'contents' => trim($body),
                    'picture' => $picture,
                    'style' => $i,
                    'create_time' => time(),
                );

                // A failed insert is not fatal; the crawl simply moves on.
                $this->News_model->add($data);
            }
            echo '<hr>';
        }
    }

    $this->benchmark->mark('code_end');
    echo $this->benchmark->elapsed_time('code_start', 'code_end');
}
/**
 * Download a URL and return its body converted from GBK to UTF-8.
 *
 * cURL is used when the extension is loaded; otherwise file_get_contents()
 * is used. Both paths go through the same GBK to UTF-8 conversion, so
 * callers always receive UTF-8. A cURL failure echoes the cURL error message.
 *
 * @param string $url Address to fetch; surrounding whitespace is ignored.
 * @return string Trimmed UTF-8 body, or '' when the URL is empty, the
 *                download fails, or the conversion fails.
 */
function curl_snatch($url='http://www.2sche.cn/buy.asp')
{
    $url = trim($url);
    if ($url === '')
    {
        return '';
    }

    if (extension_loaded('curl'))
    {
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
        curl_setopt($ch, CURLOPT_HEADER, 0);
        // Bound every request so a stalled server cannot hang a long crawl.
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
        curl_setopt($ch, CURLOPT_TIMEOUT, 60);

        $output = curl_exec($ch);
        if ($output === FALSE)
        {
            echo "cURL Error: " . curl_error($ch);
        }
        curl_close($ch);
    }
    else
    {
        $output = file_get_contents($url);
    }

    // Nothing was fetched: do not hand a boolean to iconv().
    if ($output === FALSE || $output === '')
    {
        return '';
    }

    $content = iconv("GBK", "UTF-8", $output);
    if ($content === FALSE)
    {
        return '';
    }

    return trim($content);
}