抓取“全球 IC 采购网”供应商程序
2837 点击·0 回帖
![]() | ![]() | |
<?php
/**
 * Main program that scrapes supplier records from the
 * "Global IC Purchasing Network" (http://www.qic.com.cn/).
 *
 * author Lee.
 * Last modify $Date: 2012-2-7 09:35:21 $
 */
require_once './config.inc.php';

/**
 * Walks a fixed range of shop ids, downloads each supplier page, extracts the
 * contact fields with regular expressions and stores new suppliers in the
 * `qic` table through the project's Model class.
 */
class qic
{
    /** First shop id to fetch (inclusive). */
    private $startId;

    /** Last shop id to fetch (inclusive). */
    private $endId;

    public function __construct()
    {
        $this->startId = 27688;
        $this->endId = 55185;
    }

    /**
     * Run the scrape over every shop id in [startId, endId].
     *
     * Pages without a company name are skipped silently. For the others a
     * short status string is echoed: 'Data Exists!!', 'Add Success!!' or
     * 'Add Faild!!'.
     *
     * @return void
     */
    public function go()
    {
        // Reuse one Model for the whole run instead of building one per shop id.
        $m = new Model();

        for ($i = $this->startId; $i <= $this->endId; $i++) {
            $infoArr = $this->getInfoByUrl($this->getUrl($i));
            if (empty($infoArr['company'])) {
                continue;
            }

            // The company name is scraped (untrusted) text: escape it before
            // embedding it in the WHERE fragment so a quote in the name cannot
            // break or alter the query.
            // NOTE(review): a parameterised query inside Model would be the
            // proper fix; confirm Model does not already escape this fragment.
            $where = "company='" . addslashes($infoArr['company']) . "'";
            if ($m->isExists('qic', $where)) {
                echo 'Data Exists!!';
                continue;
            }

            echo $this->addInfoInDB($m, $infoArr) ? 'Add Success!!' : 'Add Faild!!';
        }
    }

    /**
     * Insert one supplier record into the `qic` table.
     *
     * @param Object $m       Model instance used to run the insert
     * @param array  $infoArr Supplier fields as returned by getInfoByUrl()
     * @return Number Whatever $m->insert() returns; go() treats it as truthy on success
     */
    private function addInfoInDB($m, $infoArr)
    {
        $fields = array(
            'company', 'person', 'phone', 'mobile', 'fax', 'qq',
            'msn', 'email', 'address', 'website', 'shopUrl',
        );

        // Build the value list in the same order as the column list.
        $values = array();
        foreach ($fields as $field) {
            $values[] = $infoArr[$field];
        }

        return $m->insert('qic', $fields, $values);
    }

    /**
     * Download a supplier page and extract its contact details.
     *
     * Fields that are not present on the page (or the whole page, if the
     * download fails) come back as empty strings.
     *
     * @param string $url Supplier page address
     * @return array Map with keys company, person, phone, mobile, fax, qq,
     *               msn, email, address, website and shopUrl
     */
    private function getInfoByUrl($url)
    {
        $re = $this->getContent($url);
        if ($re === false) {
            // Download failed: treat it as an empty page so go() skips it.
            $re = '';
        }

        return array(
            'company' => $this->formatString($this->firstMatch('/<div class=\"gs\-font\">(.*)<\/div>/Usi', $re)),
            'person'  => $this->formatString($this->firstMatch('/<li>\s*联 系 人:(.*)<\/li>/Usi', $re)),
            'phone'   => $this->formatString($this->firstMatch('/<li>\s*电 话:(.*)<\/li>/Usi', $re)),
            'mobile'  => $this->formatString($this->firstMatch('/<li>\s*手 机:(.*)<\/li>/Usi', $re)),
            'fax'     => $this->formatString($this->firstMatch('/<li>\s*传 真:(.*)<\/li>/Usi', $re)),
            'qq'      => $this->formatString($this->firstMatch('/<li>\s*QQ:(.*)<\/li>/Usi', $re), 'qm'),
            'msn'     => $this->formatString($this->firstMatch('/<li>\s*MSN:(.*)<\/li>/Usi', $re), 'qm'),
            'email'   => $this->formatString($this->firstMatch('/<li>\s*邮 箱:(.*)<\/li>/Usi', $re)),
            'address' => $this->formatString($this->firstMatch('/公司地址:(.*)<\/li>/Usi', $re)),
            'website' => $this->formatString($this->firstMatch('/公司网址:(.*)<\/li>/Usi', $re), 'a'),
            'shopUrl' => $url,
        );
    }

    /**
     * Return the first capture group of the first match of $pattern in
     * $subject, or '' when the pattern does not match.
     *
     * @param string $pattern PCRE pattern with at least one capture group
     * @param string $subject Text to search
     * @return string
     */
    private function firstMatch($pattern, $subject)
    {
        if (preg_match($pattern, $subject, $found) === 1 && isset($found[1])) {
            return $found[1];
        }

        return '';
    }

    /**
     * Fetch the raw content of a page.
     *
     * @param string $url
     * @return string|false Page content, or false when the download fails
     */
    private function getContent($url)
    {
        return file_get_contents($url);
    }

    /**
     * Normalise a raw captured field.
     *
     * @param string $str  Raw captured text
     * @param string $type 'default' trims the text; 'qm' (used for the QQ and
     *                     MSN fields) returns the text found between single
     *                     quotes; 'a' replaces an <a> element with its inner
     *                     text; any other value yields ''
     * @return string
     */
    private function formatString($str, $type = 'default')
    {
        $str = trim($str);

        switch ($type) {
            case 'default':
                return empty($str) ? '' : $str;

            case 'qm':
                if (empty($str)) {
                    return '';
                }
                // No quoted value present: return '' instead of reading a
                // missing match offset.
                if (preg_match('/\'(.+)\'/si', $str, $arr) !== 1) {
                    return '';
                }
                return trim($arr[1]);

            case 'a':
                return preg_replace('/<a.+>(.+)<\/a>/', '\1', $str);

            default:
                return '';
        }
    }

    /**
     * Build the supplier page address for a shop id.
     *
     * @param int $shopId
     * @return string
     */
    private function getUrl($shopId)
    {
        return "http://www.qic.com.cn/specialstore/tsh_{$shopId}.html";
    }
}

/*
 * Table structure

CREATE TABLE `qic` (
  `id` mediumint(8) unsigned NOT NULL auto_increment,
  `company` varchar(500) NOT NULL,
  `person` varchar(500) NOT NULL,
  `phone` varchar(300) NOT NULL,
  `mobile` varchar(300) NOT NULL,
  `fax` varchar(300) NOT NULL,
  `qq` varchar(300) NOT NULL,
  `msn` varchar(500) NOT NULL,
  `email` varchar(500) NOT NULL,
  `address` varchar(500) NOT NULL,
  `website` varchar(500) NOT NULL,
  `shopUrl` varchar(200) NOT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8

 */

$q = new qic();
$q->go();
// Closing tag kept on purpose: non-PHP text follows in this file.
?>
![]() | ![]() |