我们知道互联网上的数据越来越多,如何找到那些数据是一个问题,这个问题被搜索引擎部分解决了。找到数据了,如何抓取我们看到的数据并进行分析,这个问题就需要我们自己去解决。一般情况下我们业界采用的都是爬虫抓取数据,然后进行数据过滤,数据清洗,数据分析,数据呈现。

今天我要介绍的是如何利用php进行电影(猫眼电影)的自动抓取。

自动爬虫抓取,顾名思义主要分为两部分:一部分是自动,一部分是抓取。

如何自动,我们采用的是linux的crontab做定时任务,每天定时运行php脚本即可。

比如:我们在/etc/crontab里面填写如下内容

00 00 1 * * root /root/scripts/month_scrapy_movie

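其中前面的5个字段分别表示分钟、小时、日(每月的第几天)、月、星期。我这里在第三个位置设置为1,表示每月1日的00:00运行一次(脚本名month_scrapy_movie也正是按月执行的意思)。其它的同理:如果想每天运行一次,应写成 00 00 * * *;如果想每小时运行一次,应写成 00 * * * *。注意在第二个位置设置为1表示的是在凌晨1点执行,而不是每小时执行一次。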

后面的/root/scripts/month_scrapy_movie是要执行的脚本,我们一般使用shell脚本。

里面的内容一般是这样的:

php index.php scrapy/getGrail

这里我们使用的是thinkphp框架,所以这条命令就是访问scrapy控制器的getGrail方法,这是thinkphp的命令行访问方式。(下文示例代码中的方法名是getMovieComing,若要运行它,命令相应地写成 php index.php scrapy/getMovieComing。)

接下来我们介绍下如何进行数据抓取。

代码如下:

<?php
/**
 * Created by PhpStorm.
 * User: 梦回故里
 * Date: 2018/9/9
 * Time: 21:43.
 */
namespace Home\Controller;
use Think\Controller;
/**
 * Scraper controller: performs HTTP requests through cURL and imports the
 * Maoyan "coming soon" movie list into the movie_coming table.
 *
 * Request parameters (cookie, user agent, URL, headers, referer, POST data)
 * are held as instance state, configured through the setters below, and
 * consumed by getCurl().
 */
class ScrapyController extends Controller {
    /**
     * Default Cookie header value sent with every request.
     *
     * NOTE(review): the value looks like a captured browser session and its
     * contents reference douban.com rather than maoyan.com — confirm whether
     * it is still needed for the Maoyan endpoint.
     */
    const DEFAULT_COOKIE = 'bid=XfjM2SvLv-A; ll="118137"; _vwo_uuid_v2=6FA9AE4C26C1CE720E21D6563D466BA9|2679b96ecbc89a60e4a31ca967336c2a; _ck_desktop_mode=1; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1464597665%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3Ds0xD8d-Okh9c1zxfZrxta3gu-cvOjUuQxBginwzpNDNGkJcGRIfiid3XMlmCEEksqvWYYgPN2AdfNueKnvGdu_sNcejBvikPypwyr8zWXBq%26wd%3D%26eqid%3Daa28a16100afcd5100000002574bae86%22%5D; as="https://movie.douban.com/subject/6873736/?from=showing"; ps=y; push_noty_num=1; push_doumail_num=0; __utma=30149280.1803998130.1464072318.1464592219.1464595569.8; __utmc=30149280; __utmz=30149280.1464583525.6.5.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmv=30149280.13850; __utma=223695111.582615646.1464105735.1464595569.1464597665.5; __utmc=223695111; __utmz=223695111.1464577681.2.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; _pk_id.100001.4cf6=83fef4e30ba0682b.1464105734.5.1464598089.1464595658.; ap=1';

    /** Seconds allowed for establishing the connection. */
    const CONNECT_TIMEOUT = 10;

    /** Seconds allowed for the whole transfer. */
    const REQUEST_TIMEOUT = 60;

    private $cookie = self::DEFAULT_COOKIE;
    private $isPost = false;
    private $curlPost = array();
    private $userAgent = 'Mozilla/5.0 (Linux; Android 5.0.2; Redmi Note 3 Build/LRX22G; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/45.0.2454.95 Mobile Safari/537.36  AliApp(DY/6.0.0) TBMovie/6.0.0 1080X1920';
    private $url = '';
    private $header = array();
    private $referer = '';

    /**
     * Maps a rating CSS class name ('allstar10' .. 'allstar50') to 1..5.
     *
     * A string-keyed lookup is used instead of loose `==` comparisons, which
     * on PHP < 8 treated integer 0 as equal to 'allstar10'.
     *
     * @param mixed $string class name to translate
     * @return int star count, or 0 for anything unrecognised
     */
    private function ratingstar($string) {
        $stars = array(
            'allstar10' => 1,
            'allstar20' => 2,
            'allstar30' => 3,
            'allstar40' => 4,
            'allstar50' => 5,
        );
        if (is_string($string) && isset($stars[$string])) {
            return $stars[$string];
        }
        return 0;
    }

    /**
     * Joins the `name` property of each director into a comma-separated string.
     *
     * @param array|object $directors iterable of objects exposing `name`
     * @return string
     */
    private function director_string($directors) {
        return $this->joinNames($directors);
    }

    /**
     * Joins the `name` property of each cast member into a comma-separated string.
     *
     * @param array|object $casts iterable of objects exposing `name`
     * @return string
     */
    private function actor_string($casts) {
        return $this->joinNames($casts);
    }

    /**
     * Joins type values into a comma-separated string.
     *
     * @param array|object $types iterable of scalar values
     * @return string
     */
    private function type_string($types) {
        return $this->joinValues($types);
    }

    /**
     * Joins country/region values into a comma-separated string.
     *
     * @param array|object $countries iterable of scalar values
     * @return string
     */
    private function region_string($countries) {
        return $this->joinValues($countries);
    }

    /**
     * Joins values with commas and strips leading/trailing commas.
     *
     * @param mixed $values array or object to iterate; anything else yields ''
     * @return string
     */
    private function joinValues($values) {
        if (!is_array($values) && !is_object($values)) {
            return '';
        }
        $parts = array();
        foreach ($values as $value) {
            $parts[] = $value;
        }
        return trim(implode(',', $parts), ',');
    }

    /**
     * Joins the `name` property of each item with commas; items without a
     * `name` contribute an empty entry.
     *
     * @param mixed $items array or object to iterate; anything else yields ''
     * @return string
     */
    private function joinNames($items) {
        if (!is_array($items) && !is_object($items)) {
            return '';
        }
        $names = array();
        foreach ($items as $item) {
            $names[] = isset($item->name) ? $item->name : '';
        }
        return trim(implode(',', $names), ',');
    }

    /**
     * Reads a property from a decoded JSON object without raising a notice.
     *
     * @param mixed  $item    decoded JSON value (normally a stdClass)
     * @param string $name    property name
     * @param mixed  $default value returned when the property is absent
     * @return mixed
     */
    private function prop($item, $name, $default = null) {
        return isset($item->$name) ? $item->$name : $default;
    }

    /**
     * @return string
     */
    public function getCookie() {
        return $this->cookie;
    }

    /**
     * @param string $cookie
     */
    public function setCookie($cookie) {
        $this->cookie = $cookie;
    }

    /**
     * @return bool
     */
    public function isPost() {
        return $this->isPost;
    }

    /**
     * @param bool $isPost
     */
    public function setIsPost($isPost) {
        $this->isPost = $isPost;
    }

    /**
     * @return array
     */
    public function getCurlPost() {
        return $this->curlPost;
    }

    /**
     * @param array $curlPost
     */
    public function setCurlPost($curlPost) {
        $this->curlPost = $curlPost;
    }

    /**
     * @return string
     */
    public function getUserAgent() {
        return $this->userAgent;
    }

    /**
     * @param string $userAgent
     */
    public function setUserAgent($userAgent) {
        $this->userAgent = $userAgent;
    }

    /**
     * @return string
     */
    public function getUrl() {
        return $this->url;
    }

    /**
     * @param string $url
     */
    public function setUrl($url) {
        $this->url = $url;
    }

    /**
     * @return array
     */
    public function getHeader() {
        return $this->header;
    }

    /**
     * @param array $header
     */
    public function setHeader($header) {
        $this->header = $header;
    }

    /**
     * @return string
     */
    public function getReferer() {
        return $this->referer;
    }

    /**
     * @param string $referer
     */
    public function setReferer($referer) {
        $this->referer = $referer;
    }

    /**
     * Runs the framework controller initialisation, then loads the HTML
     * parsing library.
     */
    public function __construct() {
        parent::__construct();
        include_once 'simple_html_dom.php';
    }

    /**
     * Performs one HTTP request using the currently configured URL, cookie,
     * user agent, headers, referer and (when isPost is set) POST fields.
     *
     * @return string|bool response body, or false when the transfer failed
     */
    public function getCurl() {
        $ch = curl_init();
        // Proxy support (CURLOPT_PROXY / CURLOPT_PROXYPORT) was sketched here
        // previously but is not enabled.
        curl_setopt($ch, CURLOPT_COOKIE, $this->cookie);
        if ($this->isPost) {
            curl_setopt($ch, CURLOPT_POST, 1);
            curl_setopt($ch, CURLOPT_POSTFIELDS, $this->curlPost);
        } else {
            curl_setopt($ch, CURLOPT_POST, 0);
        }
        // NOTE(review): peer certificate verification is disabled; this is
        // unsafe for https URLs and should be revisited.
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
        curl_setopt($ch, CURLOPT_USERAGENT, $this->userAgent);
        curl_setopt($ch, CURLOPT_URL, $this->url);
        curl_setopt($ch, CURLOPT_HTTPHEADER, $this->header);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_REFERER, $this->referer);
        // Without timeouts an unresponsive server would block the caller
        // (typically an unattended scheduled job) indefinitely.
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, self::CONNECT_TIMEOUT);
        curl_setopt($ch, CURLOPT_TIMEOUT, self::REQUEST_TIMEOUT);
        $page_content = curl_exec($ch);
        curl_close($ch);
        return $page_content;
    }

    /**
     * Fetches the Maoyan "coming soon" list and mirrors it into movie_coming.
     *
     * Every existing row has its orderid reset to 0, then each movie in the
     * response is inserted or updated (matched on `mid`) with orderid set to
     * its 1-based position in the response. Nothing is written when the
     * request fails or the response is not the expected JSON.
     *
     * @return void
     */
    public function getMovieComing() {
        $this->referer = 'http://m.maoyan.com';
        $this->cookie = self::DEFAULT_COOKIE;
        $this->userAgent = 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1';
        $this->url = 'http://m.maoyan.com/ajax/comingList?ci=1&token=&limit=1000';

        $info = $this->getCurl();
        if (!is_string($info) || $info === '') {
            // Transfer failed: keep the existing table contents untouched.
            return;
        }

        // Escaped double quotes inside JSON strings are turned into single
        // quotes before decoding, so stored text never contains '"'.
        $info = str_replace('\\"', "'", $info);
        $obj = json_decode($info);
        if (!is_object($obj) || !isset($obj->coming) || !is_array($obj->coming)) {
            // Not the expected payload (invalid JSON, error page, ...).
            return;
        }

        $model = D('movie_coming');

        // ThinkPHP refuses to run save() without a condition or primary key,
        // so an explicit always-true condition is required to reset all rows.
        $model->where('1=1')->save(array('orderid' => 0));

        $position = 1;
        foreach ($obj->coming as $movie) {
            $mid = $this->prop($movie, 'id');
            if ($mid === null) {
                // Without an id the row cannot be matched or stored.
                continue;
            }

            $fields = array(
                'scm'      => '',
                'showinfo' => $this->prop($movie, 'showInfo', ''),
                'rt'       => $this->prop($movie, 'rt'),
                'dir'      => '',
                'cat'      => '',
                'wish'     => $this->prop($movie, 'wish'),
                'nm'       => $this->prop($movie, 'nm'),
                'poster'   => $this->prop($movie, 'img'),
                'dur'      => '',
                'orderid'  => $position,
            );

            $existing = $model->where(array('mid' => $mid))->find();
            if ($existing) {
                // Match on mid explicitly: it works whether or not mid is
                // the table's primary key.
                $model->where(array('mid' => $mid))->save($fields);
            } else {
                $model->add(array_merge(array('mid' => $mid), $fields));
            }
            ++$position;
        }
    }
}

这里面主要就是采用curl函数抓取猫眼电影。这里的api地址是:

http://m.maoyan.com/ajax/comingList?ci=1&token=&limit=1000

我们可以看到这是个json数据格式,而php是最擅长处理字符串的语言。

$obj = json_decode($info);

一个json_decode 就可以转换成对象形式,接下来就是数据的存储了。

梦回故里整理。

Comments are closed.