<?php
/**
* 抓取“中國 IC 網(http://www.ic37.com)”供應商主程序
* author Lee.
* Last modify $Date: 2012-2-9 9:32:21 $
* 注:本程序按照編碼 GB2312 執行,因為“中國 IC 網”網站是GB2312編碼,數據庫也得保持一致
*/
// This file declares ic37 more than once. An unguarded repeat declaration is a
// compile-time fatal error, so the class is only defined if it does not exist yet.
if (!class_exists('ic37', false)) {
    /**
     * Scraper for supplier contact details on "China IC Net" (http://www.ic37.com).
     *
     * For a given part number it reads the site's search result pages, collects the
     * supplier shop links, downloads each shop's contact page and stores the parsed
     * fields in the `ic37` table.
     *
     * NOTE(review): the regular expressions contain literal Chinese labels and are
     * matched byte-wise (no /u modifier), so this source file presumably has to be
     * saved in the same encoding as the scraped pages (the file header says GB2312)
     * - confirm before re-encoding the file.
     */
    class ic37 {
        /** @var string Part number currently being searched. */
        private $key;
        /** @var int Number of search result pages for the current part number. */
        private $pageNum;
        /** @var mysqli|null Connection opened on first use and reused for every record. */
        private $db;

        /**
         * Entry point: scrape and store every supplier listed for one part number.
         *
         * @param string $key Part number to search for.
         * @return void
         */
        public function go($key) {
            $this->key = $key;
            $this->pageNum = $this->getPageNum();
            $this->getInfo();
        }

        /**
         * Walk every search result page and store the suppliers found on it.
         *
         * A page count of 0 (nothing found / page could not be parsed) does nothing.
         *
         * @return void
         */
        private function getInfo() {
            for ($i = 1; $i <= $this->pageNum; $i++) {
                $arr = $this->shopAddContact($this->shopUrlMatchReArr($this->getContent($i)));
                $this->isAddSuccess($arr);
            }
        }

        /**
         * Scrape and insert each supplier, echoing the outcome per URL.
         *
         * "Add Faild!!" is also printed when the company was skipped because it is
         * empty or already stored.
         *
         * @param array $arr Supplier contact-page URLs.
         * @return void
         */
        private function isAddSuccess($arr) {
            foreach ($arr as $v) {
                if ($this->execAdd($this->getInfoByShopUrl($v))) {
                    echo 'Add Success!!';
                } else {
                    echo 'Add Faild!!';
                }
            }
        }

        /**
         * Insert one supplier record unless it is empty or already present.
         *
         * Values come from scraped HTML, so they are bound as statement parameters
         * instead of being interpolated into the SQL text.
         *
         * @param array $infoArr Record as produced by getInfoByShopUrl().
         * @return bool True when a row was inserted, false otherwise.
         */
        private function execAdd($infoArr) {
            if (empty($infoArr['company'])) {
                return false;
            }
            $mysqli = $this->getDb();
            if ($mysqli->connect_errno) {
                return false;
            }
            if ($this->isExists($mysqli, $infoArr)) {
                return false; // record is already stored
            }

            $fields = array(
                'company', 'person', 'phone', 'mobile', 'qq', 'msn', 'fax',
                'email', 'address', 'country', 'region', 'zip', 'web', 'shopUrl',
            );
            $row = array();
            foreach ($fields as $field) {
                $row[$field] = isset($infoArr[$field]) ? (string) $infoArr[$field] : '';
            }

            $sql = 'INSERT INTO ic37(' . implode(',', $fields) . ') VALUES ('
                . implode(',', array_fill(0, count($fields), '?')) . ')';
            $stmt = $mysqli->prepare($sql);
            if (!$stmt) {
                return false;
            }
            $stmt->bind_param(
                'ssssssssssssss',
                $row['company'], $row['person'], $row['phone'], $row['mobile'],
                $row['qq'], $row['msn'], $row['fax'], $row['email'],
                $row['address'], $row['country'], $row['region'], $row['zip'],
                $row['web'], $row['shopUrl']
            );
            $ok = $stmt->execute();
            $stmt->close();
            return $ok;
        }

        /**
         * Normalise a scraped field value.
         *
         * NOTE(review): this removes every space, including spaces inside names; it
         * may originally have been meant to strip a non-breaking space - confirm.
         *
         * @param string $str Raw captured text.
         * @return string Trimmed text without spaces and without the "contact us" suffix.
         */
        private function formatStr($str) {
            $str = trim((string) $str);
            $str = str_replace(' ', '', $str);
            $str = str_replace('==聯系我們', '', $str);
            return $str;
        }

        /**
         * Return the database connection, opening it on first use.
         *
         * NOTE(review): credentials are hard-coded here; consider moving them to
         * configuration.
         *
         * @return mysqli Shared connection (check connect_errno before use).
         */
        private function getDb() {
            if ($this->db === null) {
                $this->db = new mysqli('localhost', 'root', '1715544', 'weiku');
                if (!$this->db->connect_errno) {
                    // set_charset() also informs the client library, unlike SET NAMES.
                    $this->db->set_charset('gb2312');
                }
            }
            return $this->db;
        }

        /**
         * Check whether a company is already stored in the `ic37` table.
         *
         * @param mysqli $mysqli  Open connection.
         * @param array  $infoArr Record whose 'company' value is looked up.
         * @return bool True when at least one row with that company name exists.
         */
        private function isExists($mysqli, $infoArr) {
            $stmt = $mysqli->prepare('SELECT 1 FROM ic37 WHERE company = ? LIMIT 1');
            if (!$stmt) {
                // Lookup impossible; let the INSERT report the real failure.
                return false;
            }
            $company = isset($infoArr['company']) ? (string) $infoArr['company'] : '';
            $stmt->bind_param('s', $company);
            $stmt->execute();
            $stmt->store_result();
            $exists = $stmt->num_rows > 0;
            $stmt->close();
            return $exists;
        }

        /**
         * Download a supplier contact page and extract its fields.
         *
         * Anchor tags and the blue <font> wrappers are stripped first so that the
         * field patterns see plain cell text. Fields that are not found are ''.
         *
         * @param string $url Supplier contact-page URL.
         * @return array Map of field name => cleaned value, plus 'shopUrl' => $url.
         */
        private function getInfoByShopUrl($url) {
            $re = (string) preg_replace('/<a.+>(.*)<\/a>/', '\1', str_replace('</font>', '', str_replace('<font color="#000099">', '', $this->getUrlInfo($url))));
            $patterns = array(
                'company' => '/<title>(.*)<\/title>/Usi',
                'person'  => '/<strong>聯系人:<\/strong><\/td>\s*<td.*>(.*)<\/td>/Usi',
                'phone'   => '/<strong>電話:<\/strong><\/td>\s*<td.*>(.*)<\/td>/Usi',
                'mobile'  => '/<strong>手機:<\/strong><\/td>\s*<td.*>(.*)<\/td>/Usi',
                'qq'      => '/<strong>QQ:<\/strong><\/td>\s*<td.*>(.*)<\/td>/Usi',
                'msn'     => '/<strong>MSN:<\/strong><\/td>\s*<td.*>(.*)<\/td>/Usi',
                'fax'     => '/<strong>傳真:<\/strong><\/td>\s*<td.*>(.*)<\/td>/Usi',
                'email'   => '/<strong>EMail:<\/strong><\/td>\s*<td.*>(.*)<\/td>/Usi',
                'address' => '/司地址[:]*[<\/strong>]*[<strong>]*[:]*[<\/strong>]*<\/td>\s*<td.*>(.*)<\/td>/Usi',
                'country' => '/<strong>國家[:]*<\/strong>[<strong>]*[:]*[<\/strong>]*<\/td>\s*<td.*>(.*)<\/td>/Usi',
                'region'  => '/<strong>地區:<\/strong><\/td>\s*<td.*>(.*)<\/td>/Usi',
                'zip'     => '/<strong>郵政編碼:<\/strong><\/td>\s*<td.*>(.*)<\/td>/Usi',
                'web'     => '/<strong>\s*網址[1]*:<\/strong><\/td>\s*<td.*>(.*)<\/td>/Usi',
            );
            $infoArr = array();
            foreach ($patterns as $field => $pattern) {
                $infoArr[$field] = $this->formatStr($this->firstMatch($pattern, $re));
            }
            $infoArr['shopUrl'] = $url;
            return $infoArr;
        }

        /**
         * Return capture group 1 of the first match, or '' when nothing matches.
         *
         * @param string $pattern PCRE pattern with at least one capture group.
         * @param string $subject Text to search.
         * @return string
         */
        private function firstMatch($pattern, $subject) {
            if (preg_match($pattern, $subject, $m) === 1 && isset($m[1])) {
                return $m[1];
            }
            return '';
        }

        /**
         * Extract the distinct supplier shop URLs from a search result page.
         *
         * @param string $re HTML of one search result page.
         * @return array Absolute shop URLs (keys are the original match positions).
         */
        private function shopUrlMatchReArr($re) {
            preg_match_all('/<p class="Company"><a.* href=\"(.+)\".*>[<font color="#FF0000">]*.*[<\/font>]*<\/a>\s*<\/p>/Usi', $re, $arr);
            return $this->formatUrlArr(array_unique($arr[1]));
        }

        /**
         * Keep only the entries that contain "http://", preserving their keys.
         *
         * @param array $arr Candidate URLs.
         * @return array
         */
        private function formatUrlArr($arr) {
            $newArr = array();
            foreach ($arr as $key => $value) {
                if ($this->isExistsHttp($value)) {
                    $newArr[$key] = $value;
                }
            }
            return $newArr;
        }

        /**
         * Collect the account ids from alt="<label>:<id>" attributes.
         *
         * @param string $str HTML fragment holding the messenger icons.
         * @param string $e   Label to look for, e.g. 'QQ' or 'MSN'.
         * @return string Ids separated by single spaces, '' when none are found.
         */
        private function formatQqMsn($str, $e='QQ') {
            if (empty($str)) {
                return '';
            }
            // The label is escaped so it cannot alter the pattern.
            preg_match_all('/alt="' . preg_quote($e, '/') . '\:(.+)"/Usi', $str, $arr);
            if (count($arr[1]) == 1) {
                return $arr[1][0];
            }
            return rtrim(implode(' ', $arr[1]), ' ');
        }

        /**
         * Make every shop URL point at its contact.asp page.
         *
         * @param array $arr Shop URLs.
         * @return array Same keys, URLs ending in contact.asp (empty array for no input).
         */
        private function shopAddContact($arr) {
            $newArr = array();
            foreach ($arr as $k => $v) {
                if (stristr($v, 'contact.asp') === FALSE) {
                    $newArr[$k] = $this->addContact($v);
                } else {
                    $newArr[$k] = $v;
                }
            }
            return $newArr;
        }

        /**
         * Append "/contact.asp" to a shop URL without doubling a trailing slash.
         *
         * @param string $str Shop base URL.
         * @return string
         */
        private function addContact($str) {
            return rtrim($str, '/') . '/contact.asp';
        }

        /**
         * Replace an anchor element by its inner text.
         *
         * @param string $site HTML fragment.
         * @return string
         */
        private function stripATags($site) {
            return preg_replace('/<a.+>(.+)<\/a>/', '\1', $site);
        }

        /**
         * Tell whether a URL contains "http://" (case-insensitive).
         *
         * @param string $url
         * @return bool
         */
        private function isExistsHttp($url) {
            return stripos((string) $url, 'http://') !== false;
        }

        /**
         * Download one search result page for the current part number.
         *
         * @param int $page 1-based page number.
         * @return string Page HTML, '' when the download failed.
         */
        private function getContent($page=1) {
            $re = file_get_contents($this->getUrl($this->key, $page));
            return $re === false ? '' : $re;
        }

        /**
         * Read the number of result pages from the first search page.
         *
         * @return int Page count, 0 when the pager text is missing.
         */
        private function getPageNum() {
            if (preg_match('/共.*條記錄分(.*)頁顯示/Usi', $this->getContent(), $m) !== 1) {
                return 0;
            }
            // The captured text may carry markup or whitespace around the number.
            return (int) trim(strip_tags($m[1]));
        }

        /**
         * Build the search URL for a part number and page.
         *
         * @param string $str  Part number; URL-encoded here so '/', '&', '#' etc. are safe.
         * @param int    $page 1-based page number.
         * @return string
         */
        private function getUrl($str, $page=1) {
            $keyword = urlencode($str);
            return "http://www.ic37.com/sell/search.asp?keyword={$keyword}&x=86&y=22&page={$page}";
        }

        /**
         * Download an arbitrary page.
         *
         * @param string $url
         * @return string Page content, '' when the download failed.
         */
        private function getUrlInfo($url) {
            $re = file_get_contents($url);
            return $re === false ? '' : $re;
        }
    }
}
/*
程序運行思路:根據“中國 IC 網”的IC搜索功能,輸入型號進行搜索,然後抓取供應商信息
數據庫結構
CREATE TABLE `ic37` (
`id` mediumint(8) unsigned NOT NULL auto_increment,
`company` varchar(500) default NULL,
`person` varchar(500) default NULL,
`phone` varchar(500) default NULL,
`mobile` varchar(500) default NULL,
`qq` varchar(500) default NULL,
`msn` varchar(500) default NULL,
`fax` varchar(500) default NULL,
`email` varchar(500) default NULL,
`address` varchar(1000) default NULL,
`country` varchar(500) default NULL,
`region` varchar(500) default NULL,
`zip` varchar(500) default NULL,
`web` varchar(500) default NULL,
`shopUrl` varchar(500) default NULL,
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=gb2312
*/
// Part numbers to look up. The list contains repeats on purpose-or-by-accident;
// array_unique() collapses them so each part number is searched once.
$arr = array_unique(array(
    'MAX3232', 'AML8613', 'MT6225A', 'OM8373PS/N3/A', 'PT7313', 'MAX8212ESA',
    'TL431', 'S3C2440', 'TMS320F2812PGFA', 'PCM1704', 'AN6717', 'CA3162E',
    'CA3161E', 'LM393N', 'DS18B20', 'SHT10', 'AML8613', 'AN6717', 'LM393N',
    'CA3161E', 'CA3162E', 'PCM1704', 'STK392-040', 'K1667', 'MAX232',
    'STM32F103', 'LM358', 'NE555', '78L05', 'LM324', 'TL431', 'PC817', '7805',
    'LM339', 'LM317', '46A-3GRI', 'MODEL', '78L05', '93C46-3GRI', '8050',
    'DS18B20', 'TDA2030', 'LM393', '74HC595', '6N137', 'SN75176BDR'
));

// One scraper instance handles every part number in turn.
$k = new ic37();
foreach ($arr as $v) {
    $k->go($v);
}
?>
<?php
/**
* 抓取“中國 IC 網(http://www.ic37.com)”供應商主程序
* author Lee.
* Last modify $Date: 2012-2-9 9:32:21 $
* 注:本程序按照編碼 GB2312 執行,因為“中國 IC 網”網站是GB2312編碼,數據庫也得保持一致
*/
// This file declares ic37 more than once. An unguarded repeat declaration is a
// compile-time fatal error, so the class is only defined if it does not exist yet.
if (!class_exists('ic37', false)) {
    /**
     * Scraper for supplier contact details on "China IC Net" (http://www.ic37.com).
     *
     * For a given part number it reads the site's search result pages, collects the
     * supplier shop links, downloads each shop's contact page and stores the parsed
     * fields in the `ic37` table.
     *
     * NOTE(review): the regular expressions contain literal Chinese labels and are
     * matched byte-wise (no /u modifier), so this source file presumably has to be
     * saved in the same encoding as the scraped pages (the file header says GB2312)
     * - confirm before re-encoding the file.
     */
    class ic37 {
        /** @var string Part number currently being searched. */
        private $key;
        /** @var int Number of search result pages for the current part number. */
        private $pageNum;
        /** @var mysqli|null Connection opened on first use and reused for every record. */
        private $db;

        /**
         * Entry point: scrape and store every supplier listed for one part number.
         *
         * @param string $key Part number to search for.
         * @return void
         */
        public function go($key) {
            $this->key = $key;
            $this->pageNum = $this->getPageNum();
            $this->getInfo();
        }

        /**
         * Walk every search result page and store the suppliers found on it.
         *
         * A page count of 0 (nothing found / page could not be parsed) does nothing.
         *
         * @return void
         */
        private function getInfo() {
            for ($i = 1; $i <= $this->pageNum; $i++) {
                $arr = $this->shopAddContact($this->shopUrlMatchReArr($this->getContent($i)));
                $this->isAddSuccess($arr);
            }
        }

        /**
         * Scrape and insert each supplier, echoing the outcome per URL.
         *
         * "Add Faild!!" is also printed when the company was skipped because it is
         * empty or already stored.
         *
         * @param array $arr Supplier contact-page URLs.
         * @return void
         */
        private function isAddSuccess($arr) {
            foreach ($arr as $v) {
                if ($this->execAdd($this->getInfoByShopUrl($v))) {
                    echo 'Add Success!!';
                } else {
                    echo 'Add Faild!!';
                }
            }
        }

        /**
         * Insert one supplier record unless it is empty or already present.
         *
         * Values come from scraped HTML, so they are bound as statement parameters
         * instead of being interpolated into the SQL text.
         *
         * @param array $infoArr Record as produced by getInfoByShopUrl().
         * @return bool True when a row was inserted, false otherwise.
         */
        private function execAdd($infoArr) {
            if (empty($infoArr['company'])) {
                return false;
            }
            $mysqli = $this->getDb();
            if ($mysqli->connect_errno) {
                return false;
            }
            if ($this->isExists($mysqli, $infoArr)) {
                return false; // record is already stored
            }

            $fields = array(
                'company', 'person', 'phone', 'mobile', 'qq', 'msn', 'fax',
                'email', 'address', 'country', 'region', 'zip', 'web', 'shopUrl',
            );
            $row = array();
            foreach ($fields as $field) {
                $row[$field] = isset($infoArr[$field]) ? (string) $infoArr[$field] : '';
            }

            $sql = 'INSERT INTO ic37(' . implode(',', $fields) . ') VALUES ('
                . implode(',', array_fill(0, count($fields), '?')) . ')';
            $stmt = $mysqli->prepare($sql);
            if (!$stmt) {
                return false;
            }
            $stmt->bind_param(
                'ssssssssssssss',
                $row['company'], $row['person'], $row['phone'], $row['mobile'],
                $row['qq'], $row['msn'], $row['fax'], $row['email'],
                $row['address'], $row['country'], $row['region'], $row['zip'],
                $row['web'], $row['shopUrl']
            );
            $ok = $stmt->execute();
            $stmt->close();
            return $ok;
        }

        /**
         * Normalise a scraped field value.
         *
         * NOTE(review): this removes every space, including spaces inside names; it
         * may originally have been meant to strip a non-breaking space - confirm.
         *
         * @param string $str Raw captured text.
         * @return string Trimmed text without spaces and without the "contact us" suffix.
         */
        private function formatStr($str) {
            $str = trim((string) $str);
            $str = str_replace(' ', '', $str);
            $str = str_replace('==聯系我們', '', $str);
            return $str;
        }

        /**
         * Return the database connection, opening it on first use.
         *
         * NOTE(review): credentials are hard-coded here; consider moving them to
         * configuration.
         *
         * @return mysqli Shared connection (check connect_errno before use).
         */
        private function getDb() {
            if ($this->db === null) {
                $this->db = new mysqli('localhost', 'root', '1715544', 'weiku');
                if (!$this->db->connect_errno) {
                    // set_charset() also informs the client library, unlike SET NAMES.
                    $this->db->set_charset('gb2312');
                }
            }
            return $this->db;
        }

        /**
         * Check whether a company is already stored in the `ic37` table.
         *
         * @param mysqli $mysqli  Open connection.
         * @param array  $infoArr Record whose 'company' value is looked up.
         * @return bool True when at least one row with that company name exists.
         */
        private function isExists($mysqli, $infoArr) {
            $stmt = $mysqli->prepare('SELECT 1 FROM ic37 WHERE company = ? LIMIT 1');
            if (!$stmt) {
                // Lookup impossible; let the INSERT report the real failure.
                return false;
            }
            $company = isset($infoArr['company']) ? (string) $infoArr['company'] : '';
            $stmt->bind_param('s', $company);
            $stmt->execute();
            $stmt->store_result();
            $exists = $stmt->num_rows > 0;
            $stmt->close();
            return $exists;
        }

        /**
         * Download a supplier contact page and extract its fields.
         *
         * Anchor tags and the blue <font> wrappers are stripped first so that the
         * field patterns see plain cell text. Fields that are not found are ''.
         *
         * @param string $url Supplier contact-page URL.
         * @return array Map of field name => cleaned value, plus 'shopUrl' => $url.
         */
        private function getInfoByShopUrl($url) {
            $re = (string) preg_replace('/<a.+>(.*)<\/a>/', '\1', str_replace('</font>', '', str_replace('<font color="#000099">', '', $this->getUrlInfo($url))));
            $patterns = array(
                'company' => '/<title>(.*)<\/title>/Usi',
                'person'  => '/<strong>聯系人:<\/strong><\/td>\s*<td.*>(.*)<\/td>/Usi',
                'phone'   => '/<strong>電話:<\/strong><\/td>\s*<td.*>(.*)<\/td>/Usi',
                'mobile'  => '/<strong>手機:<\/strong><\/td>\s*<td.*>(.*)<\/td>/Usi',
                'qq'      => '/<strong>QQ:<\/strong><\/td>\s*<td.*>(.*)<\/td>/Usi',
                'msn'     => '/<strong>MSN:<\/strong><\/td>\s*<td.*>(.*)<\/td>/Usi',
                'fax'     => '/<strong>傳真:<\/strong><\/td>\s*<td.*>(.*)<\/td>/Usi',
                'email'   => '/<strong>EMail:<\/strong><\/td>\s*<td.*>(.*)<\/td>/Usi',
                'address' => '/司地址[:]*[<\/strong>]*[<strong>]*[:]*[<\/strong>]*<\/td>\s*<td.*>(.*)<\/td>/Usi',
                'country' => '/<strong>國家[:]*<\/strong>[<strong>]*[:]*[<\/strong>]*<\/td>\s*<td.*>(.*)<\/td>/Usi',
                'region'  => '/<strong>地區:<\/strong><\/td>\s*<td.*>(.*)<\/td>/Usi',
                'zip'     => '/<strong>郵政編碼:<\/strong><\/td>\s*<td.*>(.*)<\/td>/Usi',
                'web'     => '/<strong>\s*網址[1]*:<\/strong><\/td>\s*<td.*>(.*)<\/td>/Usi',
            );
            $infoArr = array();
            foreach ($patterns as $field => $pattern) {
                $infoArr[$field] = $this->formatStr($this->firstMatch($pattern, $re));
            }
            $infoArr['shopUrl'] = $url;
            return $infoArr;
        }

        /**
         * Return capture group 1 of the first match, or '' when nothing matches.
         *
         * @param string $pattern PCRE pattern with at least one capture group.
         * @param string $subject Text to search.
         * @return string
         */
        private function firstMatch($pattern, $subject) {
            if (preg_match($pattern, $subject, $m) === 1 && isset($m[1])) {
                return $m[1];
            }
            return '';
        }

        /**
         * Extract the distinct supplier shop URLs from a search result page.
         *
         * @param string $re HTML of one search result page.
         * @return array Absolute shop URLs (keys are the original match positions).
         */
        private function shopUrlMatchReArr($re) {
            preg_match_all('/<p class="Company"><a.* href=\"(.+)\".*>[<font color="#FF0000">]*.*[<\/font>]*<\/a>\s*<\/p>/Usi', $re, $arr);
            return $this->formatUrlArr(array_unique($arr[1]));
        }

        /**
         * Keep only the entries that contain "http://", preserving their keys.
         *
         * @param array $arr Candidate URLs.
         * @return array
         */
        private function formatUrlArr($arr) {
            $newArr = array();
            foreach ($arr as $key => $value) {
                if ($this->isExistsHttp($value)) {
                    $newArr[$key] = $value;
                }
            }
            return $newArr;
        }

        /**
         * Collect the account ids from alt="<label>:<id>" attributes.
         *
         * @param string $str HTML fragment holding the messenger icons.
         * @param string $e   Label to look for, e.g. 'QQ' or 'MSN'.
         * @return string Ids separated by single spaces, '' when none are found.
         */
        private function formatQqMsn($str, $e='QQ') {
            if (empty($str)) {
                return '';
            }
            // The label is escaped so it cannot alter the pattern.
            preg_match_all('/alt="' . preg_quote($e, '/') . '\:(.+)"/Usi', $str, $arr);
            if (count($arr[1]) == 1) {
                return $arr[1][0];
            }
            return rtrim(implode(' ', $arr[1]), ' ');
        }

        /**
         * Make every shop URL point at its contact.asp page.
         *
         * @param array $arr Shop URLs.
         * @return array Same keys, URLs ending in contact.asp (empty array for no input).
         */
        private function shopAddContact($arr) {
            $newArr = array();
            foreach ($arr as $k => $v) {
                if (stristr($v, 'contact.asp') === FALSE) {
                    $newArr[$k] = $this->addContact($v);
                } else {
                    $newArr[$k] = $v;
                }
            }
            return $newArr;
        }

        /**
         * Append "/contact.asp" to a shop URL without doubling a trailing slash.
         *
         * @param string $str Shop base URL.
         * @return string
         */
        private function addContact($str) {
            return rtrim($str, '/') . '/contact.asp';
        }

        /**
         * Replace an anchor element by its inner text.
         *
         * @param string $site HTML fragment.
         * @return string
         */
        private function stripATags($site) {
            return preg_replace('/<a.+>(.+)<\/a>/', '\1', $site);
        }

        /**
         * Tell whether a URL contains "http://" (case-insensitive).
         *
         * @param string $url
         * @return bool
         */
        private function isExistsHttp($url) {
            return stripos((string) $url, 'http://') !== false;
        }

        /**
         * Download one search result page for the current part number.
         *
         * @param int $page 1-based page number.
         * @return string Page HTML, '' when the download failed.
         */
        private function getContent($page=1) {
            $re = file_get_contents($this->getUrl($this->key, $page));
            return $re === false ? '' : $re;
        }

        /**
         * Read the number of result pages from the first search page.
         *
         * @return int Page count, 0 when the pager text is missing.
         */
        private function getPageNum() {
            if (preg_match('/共.*條記錄分(.*)頁顯示/Usi', $this->getContent(), $m) !== 1) {
                return 0;
            }
            // The captured text may carry markup or whitespace around the number.
            return (int) trim(strip_tags($m[1]));
        }

        /**
         * Build the search URL for a part number and page.
         *
         * @param string $str  Part number; URL-encoded here so '/', '&', '#' etc. are safe.
         * @param int    $page 1-based page number.
         * @return string
         */
        private function getUrl($str, $page=1) {
            $keyword = urlencode($str);
            return "http://www.ic37.com/sell/search.asp?keyword={$keyword}&x=86&y=22&page={$page}";
        }

        /**
         * Download an arbitrary page.
         *
         * @param string $url
         * @return string Page content, '' when the download failed.
         */
        private function getUrlInfo($url) {
            $re = file_get_contents($url);
            return $re === false ? '' : $re;
        }
    }
}
/*
程序運行思路:根據“中國 IC 網”的IC搜索功能,輸入型號進行搜索,然後抓取供應商信息
數據庫結構
CREATE TABLE `ic37` (
`id` mediumint(8) unsigned NOT NULL auto_increment,
`company` varchar(500) default NULL,
`person` varchar(500) default NULL,
`phone` varchar(500) default NULL,
`mobile` varchar(500) default NULL,
`qq` varchar(500) default NULL,
`msn` varchar(500) default NULL,
`fax` varchar(500) default NULL,
`email` varchar(500) default NULL,
`address` varchar(1000) default NULL,
`country` varchar(500) default NULL,
`region` varchar(500) default NULL,
`zip` varchar(500) default NULL,
`web` varchar(500) default NULL,
`shopUrl` varchar(500) default NULL,
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=gb2312
*/
// Part numbers to search for; array_unique() removes the repeated entries so
// each one is scraped a single time.
$arr = array_unique(array(
    'MAX3232', 'AML8613', 'MT6225A', 'OM8373PS/N3/A', 'PT7313', 'MAX8212ESA',
    'TL431', 'S3C2440', 'TMS320F2812PGFA', 'PCM1704', 'AN6717', 'CA3162E',
    'CA3161E', 'LM393N', 'DS18B20', 'SHT10', 'AML8613', 'AN6717', 'LM393N',
    'CA3161E', 'CA3162E', 'PCM1704', 'STK392-040', 'K1667', 'MAX232',
    'STM32F103', 'LM358', 'NE555', '78L05', 'LM324', 'TL431', 'PC817', '7805',
    'LM339', 'LM317', '46A-3GRI', 'MODEL', '78L05', '93C46-3GRI', '8050',
    'DS18B20', 'TDA2030', 'LM393', '74HC595', '6N137', 'SN75176BDR'
));

// Run the scraper once per part number, reusing a single instance.
$k = new ic37();
foreach ($arr as $v) {
    $k->go($v);
}
?>