抓取“IC 交易网”供应商程序
4007 点击·0 回帖
![]() | ![]() | |
<?php
/**
 * Scraper for supplier listings on the "IC 交易网" site (www.ic.net.cn).
 *
 * author Lee.
 * Last modify $Date: 2012-2-6 10:44:32$
 *
 * Note: this program is meant to run in GB2312. The target site serves
 * GB2312 pages, so this source file (whose regex literals contain Chinese
 * labels) and the database connection must use the same encoding.
 *
 * How it works: for each part number, run the site's stock search, collect
 * the supplier shop links from every result page, scrape each shop page for
 * contact details and store one row per company.
 *
 * Table layout expected by execAdd() (shopUrl is listed here because the
 * INSERT writes it):
 *
 *   CREATE TABLE `ic` (
 *     `id`      mediumint(8) unsigned NOT NULL auto_increment,
 *     `company` varchar(500) NOT NULL,
 *     `address` varchar(500) default NULL,
 *     `phone`   varchar(500) default NULL,
 *     `mobile`  varchar(500) default NULL,
 *     `fax`     varchar(300) default NULL,
 *     `zip`     varchar(300) default NULL,
 *     `person`  varchar(500) default NULL,
 *     `qq`      varchar(300) default NULL,
 *     `msn`     varchar(300) default NULL,
 *     `email`   varchar(500) default NULL,
 *     `website` varchar(300) default NULL,
 *     `regDate` varchar(500) default NULL,
 *     `shopUrl` varchar(500) default NULL,
 *     PRIMARY KEY (`id`)
 *   ) ENGINE=InnoDB DEFAULT CHARSET=gb2312
 */
class ic
{
    private $key;       // part number currently being searched
    private $pageNum;   // number of result pages for the current part number
    private $firstPage; // HTML of result page 1, kept so it is not downloaded twice
    private $db;        // mysqli connection, created on first use and then reused

    /**
     * Entry point: scrape and store every supplier found for one part number.
     *
     * @param string $key part number to search for
     * @throws RuntimeException when the database connection cannot be opened
     */
    public function go($key)
    {
        $this->key = $key;
        $this->firstPage = null;
        $this->pageNum = $this->getPageNum();
        $this->getInfo();
    }

    /**
     * Walk every result page and store the suppliers linked from it.
     *
     * A page count of 0 (nothing found, or the search page could not be
     * fetched) makes this a no-op.
     */
    private function getInfo()
    {
        for ($page = 1; $page <= $this->pageNum; $page++) {
            // Page 1 was already downloaded by getPageNum(); reuse it.
            if ($page === 1 && $this->firstPage !== null) {
                $html = $this->firstPage;
            } else {
                $html = $this->getContent($page);
            }
            $this->isAddSuccess($this->shopUrlMatchReArr($html));
        }
    }

    /**
     * Scrape and store each shop URL, printing the outcome of every insert.
     *
     * @param array $arr shop URLs
     */
    private function isAddSuccess($arr)
    {
        foreach ($arr as $shopUrl) {
            if ($this->execAdd($this->getInfoByShopUrl($shopUrl))) {
                echo 'Add Success!!';
            } else {
                echo 'Add Faild!!';
            }
        }
    }

    /**
     * Insert one supplier record unless the company is empty or already stored.
     *
     * Values are bound through a prepared statement because they come
     * straight from scraped HTML and may contain quotes.
     *
     * @param array $infoArr supplier fields as built by getInfoByShopUrl()
     * @return bool true when a row was inserted, false otherwise
     */
    private function execAdd($infoArr)
    {
        if (empty($infoArr['company'])) {
            return false;
        }

        $mysqli = $this->getDb();
        if ($this->isExists($mysqli, $infoArr)) {
            return false; // the company is already in the table
        }

        $columns = array(
            'company', 'address', 'phone', 'mobile', 'fax', 'zip', 'person',
            'qq', 'msn', 'email', 'website', 'regDate', 'shopUrl',
        );
        $sql = 'INSERT INTO ic(' . implode(',', $columns) . ') VALUES ('
            . implode(',', array_fill(0, count($columns), '?')) . ')';

        $stmt = $mysqli->prepare($sql);
        if ($stmt === false) {
            return false;
        }

        // bind_param() takes its values by reference, so build an array of
        // references and hand it over with call_user_func_array().
        $values = array();
        $params = array(str_repeat('s', count($columns)));
        foreach ($columns as $column) {
            $values[$column] = isset($infoArr[$column]) ? (string) $infoArr[$column] : '';
            $params[] = &$values[$column];
        }
        call_user_func_array(array($stmt, 'bind_param'), $params);

        $inserted = $stmt->execute();
        $stmt->close();

        return $inserted;
    }

    /**
     * Return the shared database connection, opening it on first use.
     *
     * @return mysqli
     * @throws RuntimeException when the connection cannot be established
     */
    private function getDb()
    {
        if ($this->db === null) {
            // NOTE(review): credentials are hard-coded; consider moving them to configuration.
            $mysqli = new mysqli('localhost', 'root', '1715544', 'weiku');
            if ($mysqli->connect_error) {
                throw new RuntimeException('Database connection failed: ' . $mysqli->connect_error);
            }
            // Same effect as "SET NAMES GB2312", but also tells the client library.
            $mysqli->set_charset('gb2312');
            $this->db = $mysqli;
        }

        return $this->db;
    }

    /**
     * Check whether a company with the same name is already stored.
     *
     * @param mysqli $mysqli  open connection
     * @param array  $infoArr supplier fields; only 'company' is read
     * @return bool true when at least one matching row exists
     */
    private function isExists($mysqli, $infoArr)
    {
        $stmt = $mysqli->prepare('SELECT company FROM ic WHERE company = ? LIMIT 1');
        if ($stmt === false) {
            return false;
        }

        $company = (string) $infoArr['company'];
        $stmt->bind_param('s', $company);

        $found = false;
        if ($stmt->execute()) {
            // num_rows is only meaningful once the result has been buffered.
            $stmt->store_result();
            $found = $stmt->num_rows > 0;
        }
        $stmt->close();

        return $found;
    }

    /**
     * Normalise a scraped value by trimming surrounding whitespace.
     *
     * @param string $str
     * @return string
     */
    private function formatString($str)
    {
        return trim($str);
    }

    /**
     * Return the first capture group of the first match, trimmed.
     *
     * @param string $pattern PCRE pattern with one capture group
     * @param string $subject text to search
     * @return string the captured text, or '' when the pattern does not match
     */
    private function matchFirst($pattern, $subject)
    {
        if (preg_match($pattern, $subject, $match) === 1) {
            return $this->formatString($match[1]);
        }

        return '';
    }

    /**
     * Download a shop page and extract the supplier's contact fields.
     *
     * Every field that cannot be found is returned as an empty string.
     *
     * @param string $url shop page URL
     * @return array supplier fields keyed by column name
     */
    private function getInfoByShopUrl($url)
    {
        $html = $this->getUrlInfo($url);

        // Remove <span class="STYLE2"> blocks before matching the labelled cells.
        $cleaned = preg_replace('/<span class="STYLE2">.*<\/span>/Usi', '', $html);
        if ($cleaned !== null) {
            $html = $cleaned;
        }

        // Column name => label that precedes the value inside its table cell.
        $labels = array(
            'address' => '地址',
            'phone'   => '电话',
            'mobile'  => '手机',
            'fax'     => '传真',
            'zip'     => '邮编',
            'person'  => '联系人',
            'qq'      => 'QQ',
            'msn'     => 'MSN',
            'email'   => 'Email',
            'website' => '网址',
            'regDate' => '注册日期',
        );

        $infoArr = array(
            'company' => $this->matchFirst('/<title>(.+)<\/title>/Usi', $html),
        );
        foreach ($labels as $column => $label) {
            $infoArr[$column] = $this->matchFirst('/' . $label . ':(.*)<\/TD>/Usi', $html);
        }
        $infoArr['website'] = $this->stripATags($infoArr['website']);
        $infoArr['shopUrl'] = $url;

        return $infoArr;
    }

    /**
     * Extract the distinct supplier shop URLs from a search result page.
     *
     * @param string $re HTML of a result page
     * @return array shop URLs that contain "http://"
     */
    private function shopUrlMatchReArr($re)
    {
        $found = preg_match_all(
            '/<a onmousemove=\".+\" onmouSEOut=hidetip\(\) href=(.+) target=\_blank>.+<\/a>/Usi',
            $re,
            $matches
        );
        if (!$found) {
            return array();
        }

        return $this->formatUrlArr(array_unique($matches[1]));
    }

    /**
     * Keep only the entries that look like absolute http URLs (keys preserved).
     *
     * @param array $arr candidate URLs
     * @return array
     */
    private function formatUrlArr($arr)
    {
        $newArr = array();
        foreach ($arr as $key => $value) {
            if ($this->isExistsHttp($value)) {
                $newArr[$key] = $value;
            }
        }

        return $newArr;
    }

    /**
     * Collect the QQ/MSN ids found in alt="QQ:..." style attributes.
     *
     * Not called anywhere in this class at present.
     *
     * @param string $str HTML fragment
     * @param string $e   label to look for inside the alt attribute
     * @return string the ids separated by single spaces, '' when none are found
     */
    private function formatQqMsn($str, $e = 'QQ')
    {
        if (empty($str)) {
            return '';
        }
        preg_match_all('/alt="' . $e . '\:(.+)"/Usi', $str, $arr);

        return implode(' ', $arr[1]);
    }

    /**
     * Replace an <a ...>text</a> wrapper with its inner text.
     *
     * Matching is case-insensitive because the scraped pages use upper-case tags.
     *
     * @param string $site
     * @return string
     */
    private function stripATags($site)
    {
        $stripped = preg_replace('/<a.+>(.+)<\/a>/i', '\1', $site);

        return $stripped === null ? $site : $stripped;
    }

    /**
     * Tell whether the string contains "http://" (case-insensitive, anywhere).
     *
     * @param string $url
     * @return bool
     */
    private function isExistsHttp($url)
    {
        return stripos($url, 'http://') !== false;
    }

    /**
     * Download one search result page for the current part number.
     *
     * @param int $page 1-based page index
     * @return string page HTML, '' when the download fails
     */
    private function getContent($page = 1)
    {
        return $this->getUrlInfo($this->getUrl($this->key, $page));
    }

    /**
     * Read the total page count from the first result page.
     *
     * The downloaded page is cached in $firstPage for getInfo().
     *
     * @return int page count, 0 when it cannot be determined
     */
    private function getPageNum()
    {
        $this->firstPage = $this->getContent();
        if (preg_match('/共(.+)页/Usi', $this->firstPage, $match) !== 1) {
            return 0;
        }

        return (int) trim($match[1]);
    }

    /**
     * Build the search URL for a part number and page.
     *
     * NOTE(review): the parameters are separated by ';' exactly as in the
     * published source; this looks like a mangled '&' - confirm against the
     * live site before relying on it.
     *
     * @param string $str  part number (URL-encoded here)
     * @param int    $page 1-based page index
     * @return string
     */
    private function getUrl($str, $page = 1)
    {
        $page = (int) $page;
        $str = urlencode($str);

        return "http://www.ic.net.cn/partsearch/searchinstock.asp?newtype=1;area=;Page={$page};partnumber={$str};mfg=;DateCode=;QTY=;PRICE=;Exact=;orderby=inputdate;qty_filter=50;usertype2=1;pack=";
    }

    /**
     * Download a URL.
     *
     * @param string $url
     * @return string response body, '' when the download fails
     */
    private function getUrlInfo($url)
    {
        $re = file_get_contents($url);

        return $re === false ? '' : $re;
    }
}

// Driver: scrape suppliers for each distinct part number in the list.
$i = new ic();
$arr = array_unique(array('MAX3232', 'AML8613', 'MT6225A', 'OM8373PS/N3/A', 'PT7313', 'MAX8212ESA', 'TL431', 'S3C2440', 'TMS320F2812PGFA', 'PCM1704', 'AN6717', 'CA3162E', 'CA3161E', 'LM393N', 'DS18B20', 'SHT10', 'AML8613', 'AN6717', 'LM393N', 'CA3161E', 'CA3162E', 'PCM1704', 'STK392-040', 'K1667', 'MAX232', 'STM32F103', 'LM358'));
foreach ($arr as $v) {
    $i->go($v);
}
?>
![]() | ![]() |