电影数据的自动抓取
我们知道互联网上的数据越来越多,如何找到那些数据是一个问题,这个问题被搜索引擎部分解决了。找到数据了,如何抓取我们看到的数据并进行分析,这个问题就需要我们自己去解决。一般情况下我们业界采用的都是爬虫抓取数据,然后进行数据过滤,数据清洗,数据分析,数据呈现。
今天我要介绍的是如何利用php进行电影(猫眼电影)的自动抓取。
自动爬虫抓取,一看就是主要两部分,一部分是自动,一部分是抓取。
如何自动,我们采用的是linux的crontab做定时任务,每天定时运行php脚本即可。
比如:我们在/etc/crontab里面填写如下内容
00 00 1 * * root /root/scripts/month_scrapy_movie
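其中前面的5个字段分别表示分钟、小时、日(每月的第几天)、月、星期。我这里前两个位置是00,第三个位置设置为1,表示每月1日的00:00运行一次(所以脚本名叫month_scrapy_movie)。如果希望每天运行一次,应写成 00 00 * * *;如果希望每小时运行一次,应写成 00 * * * *。注意把第二个位置设置为1表示的是每天凌晨1点执行,而不是每小时执行一次。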
后面的/root/scripts/month_scrapy_movie是要执行的脚本,我们一般使用shell脚本。
里面的内容一般是这样的:
php index.php scrapy/getGrail
这里我们使用的是thinkphp框架,所以这里就是访问scrapy控制器的getGrail方法,这是thinkphp的命令行访问方式。需要注意的是,下面贴出的示例代码只包含getMovieComing方法,如果要执行它,把命令中的getGrail换成getMovieComing即可。
接下来我们介绍下如何进行数据抓取。
代码如下:
<?php
/**
* Created by PhpStorm.
* User: 梦回故里
* Date: 2018/9/9
* Time: 21:43.
*/
namespace Home\Controller;
use Think\Controller;
class ScrapyController extends Controller {
/**
* Cookie header value sent with every request (passed to CURLOPT_COOKIE in getCurl()).
* NOTE(review): the default value contains movie.douban.com entries - confirm it is the cookie intended for the target site.
*/
private $cookie = 'bid=XfjM2SvLv-A; ll="118137"; _vwo_uuid_v2=6FA9AE4C26C1CE720E21D6563D466BA9|2679b96ecbc89a60e4a31ca967336c2a; _ck_desktop_mode=1; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1464597665%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3Ds0xD8d-Okh9c1zxfZrxta3gu-cvOjUuQxBginwzpNDNGkJcGRIfiid3XMlmCEEksqvWYYgPN2AdfNueKnvGdu_sNcejBvikPypwyr8zWXBq%26wd%3D%26eqid%3Daa28a16100afcd5100000002574bae86%22%5D; as="https://movie.douban.com/subject/6873736/?from=showing"; ps=y; push_noty_num=1; push_doumail_num=0; __utma=30149280.1803998130.1464072318.1464592219.1464595569.8; __utmc=30149280; __utmz=30149280.1464583525.6.5.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmv=30149280.13850; __utma=223695111.582615646.1464105735.1464595569.1464597665.5; __utmc=223695111; __utmz=223695111.1464577681.2.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; _pk_id.100001.4cf6=83fef4e30ba0682b.1464105734.5.1464598089.1464595658.; ap=1';
/** When true, getCurl() issues a POST and sends $curlPost as the request fields. */
private $isPost = false;
/** Request fields handed to CURLOPT_POSTFIELDS when $isPost is true. */
private $curlPost = array();
/** User-Agent string passed to CURLOPT_USERAGENT. */
private $userAgent = 'Mozilla/5.0 (Linux; Android 5.0.2; Redmi Note 3 Build/LRX22G; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/45.0.2454.95 Mobile Safari/537.36 AliApp(DY/6.0.0) TBMovie/6.0.0 1080X1920';
/** Target URL passed to CURLOPT_URL; empty until a caller sets it. */
private $url = '';
/** Extra request header lines passed to CURLOPT_HTTPHEADER. */
private $header = array();
/** Referer value passed to CURLOPT_REFERER. */
private $referer = '';
/**
 * Maps an "allstarNN" label to its star count.
 *
 * 'allstar10' .. 'allstar50' map to 1 .. 5; every other value yields 0.
 * The lookup is strict: only strings can match. The previous loose `==`
 * chain returned 1 for boolean true and, on PHP versions before 8, for
 * integer 0, because the label was coerced before comparing.
 *
 * @param mixed $string Label to translate (expected to be a string).
 * @return int Star count between 0 and 5.
 */
private function ratingstar($string) {
    static $stars = array(
        'allstar10' => 1,
        'allstar20' => 2,
        'allstar30' => 3,
        'allstar40' => 4,
        'allstar50' => 5,
    );
    // Reject non-strings up front so booleans, numbers and arrays can never
    // be coerced into a matching array key.
    if (!is_string($string)) {
        return 0;
    }
    return isset($stars[$string]) ? $stars[$string] : 0;
}
/**
 * Builds a comma-separated list from the `name` property of each director entry.
 *
 * Commas at the very start or end of the joined text are stripped, so an
 * empty input produces an empty string.
 *
 * @param array $directors Entries that expose a `name` property.
 * @return string Names joined with ','.
 */
private function director_string($directors) {
    $names = array();
    foreach ($directors as $person) {
        $names[] = $person->name;
    }
    return trim(implode(',', $names), ',');
}
/**
 * Builds a comma-separated list from the `name` property of each cast entry.
 *
 * Commas at the very start or end of the joined text are stripped, so an
 * empty input produces an empty string.
 *
 * @param array $casts Entries that expose a `name` property.
 * @return string Names joined with ','.
 */
private function actor_string($casts) {
    $names = array();
    foreach ($casts as $member) {
        $names[] = $member->name;
    }
    $joined = implode(',', $names);
    return trim($joined, ',');
}
/**
 * Joins the given type values into one comma-separated string.
 *
 * Commas at the very start or end of the joined text are stripped, so an
 * empty input produces an empty string.
 *
 * @param array $types Values to join; each is converted to a string.
 * @return string Values joined with ','.
 */
private function type_string($types) {
    $parts = array();
    foreach ($types as $label) {
        $parts[] = $label;
    }
    return trim(implode(',', $parts), ',');
}
/**
 * Joins the given country/region values into one comma-separated string.
 *
 * Commas at the very start or end of the joined text are stripped, so an
 * empty input produces an empty string.
 *
 * @param array $countries Values to join; each is converted to a string.
 * @return string Values joined with ','.
 */
private function region_string($countries) {
    $parts = array();
    foreach ($countries as $place) {
        $parts[] = $place;
    }
    $joined = implode(',', $parts);
    return trim($joined, ',');
}
/**
* Returns the Cookie header value that getCurl() sends via CURLOPT_COOKIE.
*
* @return string
*/
public function getCookie() {
return $this->cookie;
}
/**
* Replaces the Cookie header value used by subsequent getCurl() calls.
*
* @param string $cookie Raw cookie string, e.g. "a=1; b=2".
*/
public function setCookie($cookie) {
$this->cookie = $cookie;
}
/**
* Tells whether getCurl() will send a POST request with the stored fields.
*
* @return bool
*/
public function isPost() {
return $this->isPost;
}
/**
* Switches getCurl() between POST mode (true) and non-POST mode (false).
*
* @param bool $isPost
*/
public function setIsPost($isPost) {
$this->isPost = $isPost;
}
/**
* Returns the fields that getCurl() passes to CURLOPT_POSTFIELDS in POST mode.
*
* @return array
*/
public function getCurlPost() {
return $this->curlPost;
}
/**
* Sets the fields that getCurl() passes to CURLOPT_POSTFIELDS in POST mode.
*
* @param array $curlPost
*/
public function setCurlPost($curlPost) {
$this->curlPost = $curlPost;
}
/**
* Returns the User-Agent string that getCurl() sends via CURLOPT_USERAGENT.
*
* @return string
*/
public function getUserAgent() {
return $this->userAgent;
}
/**
* Replaces the User-Agent string used by subsequent getCurl() calls.
*
* @param string $userAgent
*/
public function setUserAgent($userAgent) {
$this->userAgent = $userAgent;
}
/**
* Returns the URL that getCurl() requests via CURLOPT_URL.
*
* @return string
*/
public function getUrl() {
return $this->url;
}
/**
* Sets the URL that the next getCurl() call will request.
*
* @param string $url
*/
public function setUrl($url) {
$this->url = $url;
}
/**
* Returns the extra header lines that getCurl() passes to CURLOPT_HTTPHEADER.
*
* @return array
*/
public function getHeader() {
return $this->header;
}
/**
* Sets the extra header lines that getCurl() passes to CURLOPT_HTTPHEADER.
*
* @param array $header
*/
public function setHeader($header) {
$this->header = $header;
}
/**
* Returns the Referer value that getCurl() sends via CURLOPT_REFERER.
*
* @return string
*/
public function getReferer() {
return $this->referer;
}
/**
* Sets the Referer value used by subsequent getCurl() calls.
*
* @param string $referer
*/
public function setReferer($referer) {
$this->referer = $referer;
}
/**
 * Loads the simple_html_dom helper library and then runs the base controller constructor.
 *
 * The parent constructor was previously never invoked, so whatever setup
 * Think\Controller performs on construction was skipped for this controller.
 * NOTE(review): in ThinkPHP 3.2 that setup is expected to create the view
 * object and call _initialize() - confirm against the framework version in use.
 */
public function __construct() {
    // Load the helper first so it is already available if the parent
    // constructor triggers any initialisation hook.
    include_once 'simple_html_dom.php';
    parent::__construct();
}
/**
 * Performs one HTTP request using the currently configured cookie, user agent,
 * URL, headers, referer and (optionally) POST fields, and returns the response body.
 *
 * Changes compared with the earlier version:
 *  - connect/total timeouts are set so an unresponsive server cannot block a
 *    scheduled run indefinitely;
 *  - failures of curl_init()/curl_exec() are written to the PHP error log
 *    instead of being silently returned as false;
 *  - the disabled proxy snippet was removed (it passed the whole IP list
 *    rather than the selected entry).
 *
 * @return string|false Response body, or false when the request could not be performed.
 */
public function getCurl() {
    $ch = curl_init();
    if ($ch === false) {
        error_log('ScrapyController::getCurl: curl_init() failed');
        return false;
    }

    curl_setopt($ch, CURLOPT_URL, $this->url);
    curl_setopt($ch, CURLOPT_COOKIE, $this->cookie);
    curl_setopt($ch, CURLOPT_USERAGENT, $this->userAgent);
    curl_setopt($ch, CURLOPT_REFERER, $this->referer);
    curl_setopt($ch, CURLOPT_HTTPHEADER, $this->header);

    if ($this->isPost) {
        curl_setopt($ch, CURLOPT_POST, 1);
        curl_setopt($ch, CURLOPT_POSTFIELDS, $this->curlPost);
    } else {
        curl_setopt($ch, CURLOPT_POST, 0);
    }

    // NOTE(review): certificate verification is disabled, as before. This
    // leaves HTTPS requests open to interception; enable it once a CA bundle
    // is available on the host.
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);

    // Return the body as a string instead of printing it.
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);

    // cURL waits without limit by default; bound both phases of the request.
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
    curl_setopt($ch, CURLOPT_TIMEOUT, 30);

    $page_content = curl_exec($ch);
    if ($page_content === false) {
        // The error must be read before the handle is closed.
        error_log(sprintf(
            'ScrapyController::getCurl: request to %s failed: [%d] %s',
            $this->url,
            curl_errno($ch),
            curl_error($ch)
        ));
    }
    curl_close($ch);

    return $page_content;
}
/**
 * Downloads the "coming soon" list from m.maoyan.com and mirrors it into the
 * movie_coming table: existing rows (matched by mid) are updated, new ones
 * are inserted, and orderid records the position of each movie in the feed.
 *
 * Changes compared with the earlier version:
 *  - a failed download or a response that is not the expected JSON shape now
 *    ends the run before anything is written, instead of raising notices and
 *    looping over null;
 *  - the orderid reset carries an explicit always-true condition.
 *    NOTE(review): ThinkPHP 3.2's Model::save() is expected to refuse an
 *    update that has neither a where clause nor a primary key in the data,
 *    which would have made the old reset a no-op - confirm for the version in use;
 *  - the update is addressed by an explicit where(mid) instead of relying on
 *    state left behind on the shared model instance by the preceding find();
 *  - the duplicated field list of the update/insert branches is built once;
 *  - feed entries without an id are skipped, and missing optional fields fall
 *    back to an empty value instead of triggering notices.
 *
 * @return void
 */
public function getMovieComing() {
    $this->referer = 'http://m.maoyan.com';
    // NOTE(review): this cookie string contains movie.douban.com entries;
    // confirm whether the maoyan endpoint needs it at all.
    $this->cookie = 'bid=XfjM2SvLv-A; ll="118137"; _vwo_uuid_v2=6FA9AE4C26C1CE720E21D6563D466BA9|2679b96ecbc89a60e4a31ca967336c2a; _ck_desktop_mode=1; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1464597665%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3Ds0xD8d-Okh9c1zxfZrxta3gu-cvOjUuQxBginwzpNDNGkJcGRIfiid3XMlmCEEksqvWYYgPN2AdfNueKnvGdu_sNcejBvikPypwyr8zWXBq%26wd%3D%26eqid%3Daa28a16100afcd5100000002574bae86%22%5D; as="https://movie.douban.com/subject/6873736/?from=showing"; ps=y; push_noty_num=1; push_doumail_num=0; __utma=30149280.1803998130.1464072318.1464592219.1464595569.8; __utmc=30149280; __utmz=30149280.1464583525.6.5.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmv=30149280.13850; __utma=223695111.582615646.1464105735.1464595569.1464597665.5; __utmc=223695111; __utmz=223695111.1464577681.2.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; _pk_id.100001.4cf6=83fef4e30ba0682b.1464105734.5.1464598089.1464595658.; ap=1';
    $this->userAgent = 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1';
    $this->url = 'http://m.maoyan.com/ajax/comingList?ci=1&token=&limit=1000';

    $info = $this->getCurl();
    if (!is_string($info) || $info === '') {
        // Download failed; keep the table untouched.
        return;
    }

    // Unchanged pre-processing: escaped double quotes inside JSON string
    // values are turned into single quotes before decoding.
    $info = str_replace('\\"', "'", $info);
    $obj = json_decode($info);
    if (!is_object($obj) || !isset($obj->coming) || !is_array($obj->coming)) {
        // Not the expected payload; keep the table untouched.
        return;
    }
    $movies = $obj->coming;

    // Clear the ordering of every stored row; movies still present in the
    // feed receive a fresh position below.
    D('movie_coming')->where('1=1')->save(array('orderid' => 0));

    $i = 1;
    foreach ($movies as $v) {
        if (!is_object($v) || !isset($v->id)) {
            continue;
        }
        $mid = $v->id;

        $data = array(
            'mid'      => $mid,
            'scm'      => '',
            'showinfo' => isset($v->showInfo) ? $v->showInfo : '',
            'rt'       => isset($v->rt) ? $v->rt : '',
            'dir'      => '',
            'cat'      => '',
            'wish'     => isset($v->wish) ? $v->wish : 0,
            'nm'       => isset($v->nm) ? $v->nm : '',
            'poster'   => isset($v->img) ? $v->img : '',
            'dur'      => '',
            'orderid'  => $i,
        );

        $existing = D('movie_coming')->where(array('mid' => $mid))->find();
        if ($existing) {
            D('movie_coming')->where(array('mid' => $mid))->save($data);
        } else {
            D('movie_coming')->add($data);
        }
        ++$i;
    }
}
}
这里面主要就是采用curl函数抓取猫眼电影。这里的api地址是:
http://m.maoyan.com/ajax/comingList?ci=1&token=&limit=1000
我们可以看到这是个json数据格式,而php是最擅长处理字符串的语言。
$obj = json_decode($info);
一个json_decode 就可以转换成对象形式,接下来就是数据的存储了。
梦回故里整理。



评论已关闭!