电影数据的自动抓取
我们知道互联网上的数据越来越多,如何找到那些数据是一个问题,这个问题被搜索引擎部分解决了。找到数据了,如何抓取我们看到的数据并进行分析,这个问题就需要我们自己去解决。一般情况下我们业界采用的都是爬虫抓取数据,然后进行数据过滤,数据清洗,数据分析,数据呈现。
今天我要介绍的是如何利用php进行电影(猫眼电影)的自动抓取。
自动爬虫抓取,一看就是主要两部分,一部分是自动,一部分是抓取。
如何自动,我们采用的是linux的crontab做定时任务,每天定时运行php脚本即可。
比如:我们在/etc/crontab里面填写如下内容
00 00 1 * * root /root/scripts/month_scrapy_movie
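其中前面的5个字段分别表示分钟、小时、日、月、星期。我这里分钟和小时都是00,第三个位置(日)设置为1,表示每月1日的00:00运行一次(脚本名month_scrapy_movie也正是按月执行的意思)。其它位置同理:如果想每天运行一次,可以写成 00 00 * * *;如果想每小时运行一次,可以写成 00 * * * *。注意在第二个位置(小时)写1表示的是"凌晨1点",而不是"每小时"。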
后面的/root/scripts/month_scrapy_movie是要执行的脚本,我们一般使用shell脚本。
里面的内容一般是这样的:
php index.php scrapy/getGrail
这里我们使用的是thinkphp框架,所以这条命令就是访问scrapy控制器的getGrail方法,这是thinkphp的命令行访问方式。(下文代码中展示的抓取方法是getMovieComing,对应的命令就是 php index.php scrapy/getMovieComing。)
接下来我们介绍下如何进行数据抓取。
代码如下:
<?php
/**
 * Created by PhpStorm.
 * User: 梦回故里
 * Date: 2018/9/9
 * Time: 21:43.
 */

namespace Home\Controller;

use Think\Controller;

/**
 * Scraping controller: fetches pages with cURL and stores the Maoyan
 * "coming soon" movie list through the `movie_coming` model.
 *
 * The request is configured through the cookie / user agent / URL / header /
 * referer / POST properties (see the accessors) and executed by getCurl().
 */
class ScrapyController extends Controller
{
    /**
     * Captured browser cookie string sent with requests by default.
     * It is also re-applied by getMovieComing() before each fetch.
     */
    const DEFAULT_COOKIE = 'bid=XfjM2SvLv-A; ll="118137"; _vwo_uuid_v2=6FA9AE4C26C1CE720E21D6563D466BA9|2679b96ecbc89a60e4a31ca967336c2a; _ck_desktop_mode=1; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1464597665%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3Ds0xD8d-Okh9c1zxfZrxta3gu-cvOjUuQxBginwzpNDNGkJcGRIfiid3XMlmCEEksqvWYYgPN2AdfNueKnvGdu_sNcejBvikPypwyr8zWXBq%26wd%3D%26eqid%3Daa28a16100afcd5100000002574bae86%22%5D; as="https://movie.douban.com/subject/6873736/?from=showing"; ps=y; push_noty_num=1; push_doumail_num=0; __utma=30149280.1803998130.1464072318.1464592219.1464595569.8; __utmc=30149280; __utmz=30149280.1464583525.6.5.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmv=30149280.13850; __utma=223695111.582615646.1464105735.1464595569.1464597665.5; __utmc=223695111; __utmz=223695111.1464577681.2.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; _pk_id.100001.4cf6=83fef4e30ba0682b.1464105734.5.1464598089.1464595658.; ap=1';

    /** @var string Value sent as the Cookie header. */
    private $cookie = self::DEFAULT_COOKIE;

    /** @var bool When true, getCurl() issues a POST with $curlPost as the body. */
    private $isPost = false;

    /** @var array POST fields used when $isPost is true. */
    private $curlPost = array();

    /** @var string Value sent as the User-Agent header. */
    private $userAgent = 'Mozilla/5.0 (Linux; Android 5.0.2; Redmi Note 3 Build/LRX22G; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/45.0.2454.95 Mobile Safari/537.36 AliApp(DY/6.0.0) TBMovie/6.0.0 1080X1920';

    /** @var string URL requested by getCurl(). */
    private $url = '';

    /** @var array Extra HTTP header lines passed to CURLOPT_HTTPHEADER. */
    private $header = array();

    /** @var string Value sent as the Referer header. */
    private $referer = '';

    /**
     * Loads the simple_html_dom library after running the framework's own
     * controller initialisation.
     */
    public function __construct()
    {
        // The base controller constructor sets up framework state (view, hooks);
        // overriding it without this call silently skips that setup.
        parent::__construct();

        include_once 'simple_html_dom.php';
    }

    /**
     * Maps a rating token ('allstar10' ... 'allstar50') to a star count 1-5.
     *
     * @param mixed $string Rating token.
     *
     * @return int Star count, or 0 for anything that is not a known token.
     */
    private function ratingstar($string)
    {
        $stars = array(
            'allstar10' => 1,
            'allstar20' => 2,
            'allstar30' => 3,
            'allstar40' => 4,
            'allstar50' => 5,
        );

        // Strict string lookup: a loose == comparison would let non-string
        // input such as integer 0 match 'allstar10' on older PHP versions.
        if (is_string($string) && isset($stars[$string])) {
            return $stars[$string];
        }

        return 0;
    }

    /**
     * Joins the `name` property of each director into a comma-separated string.
     *
     * @param array|\Traversable $directors Objects exposing a `name` property.
     *
     * @return string
     */
    private function director_string($directors)
    {
        return $this->joinNames($directors);
    }

    /**
     * Joins the `name` property of each cast member into a comma-separated string.
     *
     * @param array|\Traversable $casts Objects exposing a `name` property.
     *
     * @return string
     */
    private function actor_string($casts)
    {
        return $this->joinNames($casts);
    }

    /**
     * Joins type values into a comma-separated string.
     *
     * @param array|\Traversable $types Scalar values.
     *
     * @return string
     */
    private function type_string($types)
    {
        return $this->joinValues($types);
    }

    /**
     * Joins country/region values into a comma-separated string.
     *
     * @param array|\Traversable $countries Scalar values.
     *
     * @return string
     */
    private function region_string($countries)
    {
        return $this->joinValues($countries);
    }

    /**
     * Collects the `name` property of every item and joins them with commas.
     *
     * @param array|\Traversable $items Objects exposing a `name` property.
     *
     * @return string
     */
    private function joinNames($items)
    {
        $names = array();
        foreach ($items as $item) {
            $names[] = $item->name;
        }

        return $this->joinValues($names);
    }

    /**
     * Joins values with commas and strips leading/trailing commas.
     *
     * @param array|\Traversable $values Scalar values.
     *
     * @return string
     */
    private function joinValues($values)
    {
        $parts = array();
        foreach ($values as $value) {
            $parts[] = $value;
        }

        // trim() keeps the historical behaviour of the concatenate-then-trim
        // loops: commas at either end of the result are removed.
        return trim(implode(',', $parts), ',');
    }

    /**
     * @return string
     */
    public function getCookie()
    {
        return $this->cookie;
    }

    /**
     * @param string $cookie
     */
    public function setCookie($cookie)
    {
        $this->cookie = $cookie;
    }

    /**
     * @return bool
     */
    public function isPost()
    {
        return $this->isPost;
    }

    /**
     * @param bool $isPost
     */
    public function setIsPost($isPost)
    {
        $this->isPost = $isPost;
    }

    /**
     * @return array
     */
    public function getCurlPost()
    {
        return $this->curlPost;
    }

    /**
     * @param array $curlPost
     */
    public function setCurlPost($curlPost)
    {
        $this->curlPost = $curlPost;
    }

    /**
     * @return string
     */
    public function getUserAgent()
    {
        return $this->userAgent;
    }

    /**
     * @param string $userAgent
     */
    public function setUserAgent($userAgent)
    {
        $this->userAgent = $userAgent;
    }

    /**
     * @return string
     */
    public function getUrl()
    {
        return $this->url;
    }

    /**
     * @param string $url
     */
    public function setUrl($url)
    {
        $this->url = $url;
    }

    /**
     * @return array
     */
    public function getHeader()
    {
        return $this->header;
    }

    /**
     * @param array $header
     */
    public function setHeader($header)
    {
        $this->header = $header;
    }

    /**
     * @return string
     */
    public function getReferer()
    {
        return $this->referer;
    }

    /**
     * @param string $referer
     */
    public function setReferer($referer)
    {
        $this->referer = $referer;
    }

    /**
     * Performs one HTTP request using the currently configured properties.
     *
     * No proxy options are set; the request goes out directly.
     *
     * @return string|false Response body, or false when the request failed
     *                      (the cURL error is written to the error log).
     */
    public function getCurl()
    {
        $ch = curl_init();
        if ($ch === false) {
            error_log('ScrapyController::getCurl: curl_init() failed');

            return false;
        }

        curl_setopt($ch, CURLOPT_COOKIE, $this->cookie);
        if ($this->isPost) {
            curl_setopt($ch, CURLOPT_POST, 1);
            curl_setopt($ch, CURLOPT_POSTFIELDS, $this->curlPost);
        } else {
            curl_setopt($ch, CURLOPT_POST, 0);
        }

        // NOTE(review): certificate verification is switched off, so HTTPS
        // responses are not authenticated. Kept for backward compatibility.
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
        curl_setopt($ch, CURLOPT_USERAGENT, $this->userAgent);
        curl_setopt($ch, CURLOPT_URL, $this->url);
        curl_setopt($ch, CURLOPT_HTTPHEADER, $this->header);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_REFERER, $this->referer);

        $page_content = curl_exec($ch);
        if ($page_content === false) {
            // Report instead of failing silently; the return value is unchanged.
            error_log('ScrapyController::getCurl: request to ' . $this->url . ' failed: ' . curl_error($ch));
        }
        curl_close($ch);

        return $page_content;
    }

    /**
     * Fetches the Maoyan "coming soon" list and mirrors it into `movie_coming`.
     *
     * Every stored row first gets orderid 0; each movie in the response is then
     * inserted, or updated when a row with the same `mid` exists, with orderid
     * set to its 1-based position in the response. If the response cannot be
     * fetched or decoded, the table is left untouched.
     *
     * @return void
     */
    public function getMovieComing()
    {
        $this->referer = 'http://m.maoyan.com';
        $this->cookie = self::DEFAULT_COOKIE;
        $this->userAgent = 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1';
        $this->url = 'http://m.maoyan.com/ajax/comingList?ci=1&token=&limit=1000';

        $info = $this->getCurl();
        if (!is_string($info) || $info === '') {
            error_log('ScrapyController::getMovieComing: empty or failed response, nothing stored');

            return;
        }

        // Historical behaviour: escaped double quotes inside JSON strings are
        // turned into single quotes before decoding.
        $info = str_replace('\\"', "'", $info);
        $obj = json_decode($info);
        if (!is_object($obj) || !isset($obj->coming) || !is_array($obj->coming)) {
            error_log('ScrapyController::getMovieComing: unexpected response (json error code ' . json_last_error() . '), nothing stored');

            return;
        }

        // The model refuses an update that has neither a condition nor a
        // primary key, so an explicit always-true condition is required to
        // reset the order of every row.
        D('movie_coming')->where('1=1')->save(array('orderid' => 0));

        $order = 1;
        foreach ($obj->coming as $movie) {
            if (!isset($movie->id)) {
                // Without an id the row cannot be matched or keyed; skip it.
                continue;
            }

            $mid = $movie->id;
            $data = array(
                'mid'      => $mid,
                'scm'      => '',
                'showinfo' => isset($movie->showInfo) ? $movie->showInfo : '',
                'rt'       => $movie->rt,
                'dir'      => '',
                'cat'      => '',
                'wish'     => $movie->wish,
                'nm'       => $movie->nm,
                'poster'   => $movie->img,
                'dur'      => '',
                'orderid'  => $order,
            );

            $existing = D('movie_coming')->where(array('mid' => $mid))->find();
            if ($existing) {
                // Update by mid explicitly rather than relying on state left
                // in the model by the preceding find().
                D('movie_coming')->where(array('mid' => $mid))->save($data);
            } else {
                D('movie_coming')->add($data);
            }

            ++$order;
        }
    }
}
这里面主要就是采用curl函数抓取猫眼电影。这里的api地址是:
http://m.maoyan.com/ajax/comingList?ci=1&token=&limit=1000
我们可以看到这是个json数据格式,而php是最擅长处理字符串的语言。
$obj = json_decode($info);
一个json_decode 就可以转换成对象形式,接下来就是数据的存储了。
梦回故里整理。
评论已关闭!