花了两天学习数据采集,在这里记录一下。
目标网站 www.cupshe.com
需求分析:采集目标站点商品的主图、名称、价格,并记录采集时间。
先建立cupshe的库
目标站点全部商品的列表页网址:www.cupshe.com/collections/all?page=1&sort_by=best-selling(sort_by=best-selling 表示按销量排序,page 为页码)
分析:列表页中每个商品 a 标签的 href 形如 /products/xxx,把 xxx 接在 /products/ 后面即可得到商品详情页的地址。
所以第一步先采集 href 中的商品详情地址。
先建立表
表名product_urls
CREATE TABLE `product_urls` (
  `id` int(10) unsigned NOT NULL AUTO_INCREMENT,
  `url` varchar(355) DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=MyISAM AUTO_INCREMENT=49 DEFAULT CHARSET=utf8;

<?php
/**
 * Download a URL with cURL and return the response body.
 *
 * NOTE(review): the opening of this script was lost when the note was
 * extracted; the signature is restored from the geturl($url) calls below.
 *
 * @param string $url Address to fetch.
 * @return string|false Response body, or false when the transfer fails.
 */
function geturl($url)
{
    $curl = curl_init();
    curl_setopt($curl, CURLOPT_URL, $url);
    // Hand the body back from curl_exec() instead of printing it.
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($curl, CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)");
    // NOTE(review): certificate verification is disabled as a workaround for
    // SSL errors on the author's machine. Re-enable it once a CA bundle is
    // configured; with it off the connection can be intercepted.
    curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, false);

    $body = curl_exec($curl);
    curl_close($curl);

    return $body;
}

// The abandoned PDO experiment instantiated PDOException instead of PDO,
// so it could never connect; mysqli is used here instead.
$conn = new mysqli("localhost", "root", "", "cupshe");
// A mysqli object is always truthy, so test connect_error rather than !$conn.
if ($conn->connect_error) {
    die("Connection failed: " . mysqli_connect_error());
}

// Scraped text goes through a bound parameter, never into the SQL string.
$stmt = $conn->prepare("INSERT INTO product_urls (`url`) VALUES (?)");
if (!$stmt) {
    die("Prepare failed: " . $conn->error);
}

$page_max = 35; // number of listing pages to walk
for ($i = 1; $i <= $page_max; $i++) {
    // "{$i}" must have no space after the brace, otherwise PHP emits "{ 1}".
    $url = "https://www.cupshe.com/collections/all?page={$i}&sort_by=best-selling";

    $html = geturl($url);
    if ($html === false) {
        // Skip a page that failed to download instead of matching against false.
        echo "Failed to fetch {$url}\n";
        continue;
    }

    // Capture the part of each product link that follows /products/.
    preg_match_all('/href="\/products\/(.+?)"/i', $html, $m);
    // The same product is linked several times per page; keep one copy each.
    $arr_products = array_values(array_unique($m[1]));

    foreach ($arr_products as $v) {
        $stmt->bind_param("s", $v);
        $stmt->execute();
    }
}

$stmt->close();
mysqli_close($conn);
?>
商品地址存入数据库之后,再新建一个 PHP 文件去采集商品详情页面的 HTML 文档,并用正则匹配出需要的字段。按照需求先建表:
CREATE TABLE `product_variants` (
  `id` int(10) unsigned NOT NULL AUTO_INCREMENT,
  `name` text,
  -- DECIMAL stores prices exactly; FLOAT would store binary approximations.
  `price` decimal(10,2) DEFAULT NULL,
  `image_src` text,
  -- The time columns stay as strings because the crawler inserts the output
  -- of date("Y-m-dH:i:s"), which is not a valid DATETIME literal.
  `updated_at` varchar(255) DEFAULT NULL,
  `create_time` varchar(255) DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=56 DEFAULT CHARSET=utf8;
Cupshe Amuse Society Halter Bikini Set //$19.99 USD //21.80 $res = geturl($urls); preg_match('/src="\/\/cdn.shopify.com\/((?!js).)*\d"/i', $res, $images); preg_match('/(.*?)<\/span>/is',$res,$names); preg_match('/(.*?)<\/span>/is',$res,$prices); $time = date("Y-m-dH:i:s"); //var_dump($images); //echo $images[0]; $image = substr($images[0],0,-1); $image = ltrim($image,"src=\""); $sql ="insert into product_variants (name,price,image_src,create_time) values ('{ $names[1]}','{ $prices[1]}','{ $image}','{ $time}')";// echo $sql; $res = mysqli_query($conn,$sql); //var_dump($name); // var_dump($name); /* foreach($m[0] as $val){ $m = substr($val,0,strlen($val)-1); $m = ltrim($m,"src=\""); //var_dump($m); //is_dir('./images/') ? '': mkdir('./images/'); //file_put_contents('./images/'.$val['url'].'.jpg', $m); } foreach($name[1] as $val_name){ var_dump($val_name); } foreach($price[1] as $val_price){ var_dump($val_price); } */ }//var_dump($res);?>
以上是单线程版本。完成之后,又开始尝试写多线程版本。(注:下面附上的代码仍是在循环中逐个调用 curl_exec,实际上还是串行抓取。)
多线程部分不做详细讲解,因为目前我自己也是一知半解。
参考地址http://www.cnblogs.com/loveyouyou616/p/5624139.html
附上代码
$url){ $curl = curl_init(); curl_setopt($curl,CURLOPT_URL,$url); curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1); curl_setopt($curl, CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)"); curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false); //SSL 报错时使用 curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, false); //SSL 报错时使用 curl_setopt($curl,CURLOPT_SSL_VERIFYPEER,FALSE); //对认证证书来源的检查 if (strpos($url,'https')){ curl_setopt ( $curl, CURLOPT_SSL_VERIFYPEER, false ); curl_setopt ( $curl, CURLOPT_SSL_VERIFYHOST, 2 ); } $res = curl_exec($curl); curl_close($curl); preg_match('/src="\/\/cdn.shopify.com\/((?!js).)*\d"/i', $res, $images); preg_match('/(.*?)<\/span>/is',$res,$names); preg_match('/(.*?)<\/span>/is',$res,$prices); $time = date("Y-m-dH:i:s"); //var_dump($images); //echo $images[0]; $image = substr($images[0],0,-1); $image = ltrim($image,"src=\""); $sql ="insert into product_variants (name,price,image_src,create_time) values ('{ $names[1]}','{ $prices[1]}','{ $image}','{ $time}')"; $res = mysqli_query($conn,$sql); var_dump($res);}$end = microtime(true) - $start;echo '';echo $end; //平均19.002983093262s
再说一下:不知道为什么这台机器上 PDO 用不了,只能用 mysqli 了。第一次用,可能比较菜吧,慢慢学习吧,学习使我快乐。(注:当时尝试 PDO 时写成了 new PDOException(...),正确写法是 new PDO(...),这很可能就是 PDO“用不了”的原因。)