  • <Scrapy spider> Scraping detailed information for the Maoyan Top 100 movies

    1. Create the Scrapy project

    Enter in the DOS (command) window:

    scrapy startproject maoyan
    
    cd maoyan
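    This generates the standard Scrapy project skeleton, roughly:

    maoyan/
        scrapy.cfg            # deploy configuration
        maoyan/
            __init__.py
            items.py          # item definitions (step 2)
            middlewares.py    # middlewares (not used here)
            pipelines.py      # item pipelines (step 5)
            settings.py       # project settings (step 6)
            spiders/          # spider modules (step 3)
                __init__.py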
    

    2. Write items.py (in effect the template: the fields to be scraped are defined here)

    # -*- coding: utf-8 -*-
    
    # Define here the models for your scraped items
    #
    # See documentation in:
    # https://doc.scrapy.org/en/latest/topics/items.html
    
    import scrapy
    
    
    class MaoyanItem(scrapy.Item):
        # define the fields for your item here like:
        # Chinese / English film title
        ztitle = scrapy.Field()
        etitle = scrapy.Field()
        # genre
        type = scrapy.Field()
        # director
        dname = scrapy.Field()
        # leading actors
        star = scrapy.Field()
        # release date
        releasetime = scrapy.Field()
        # running time
        time = scrapy.Field()
        # rating
        score = scrapy.Field()
        # poster image URL
        image = scrapy.Field()
        # synopsis
        info = scrapy.Field()
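
    An Item behaves like a dict: the spider fills fields by key, and assigning a key that was not declared as a Field raises a KeyError. A minimal usage sketch:

    item = MaoyanItem()
    item['ztitle'] = '霸王别姬'   # declared field: OK
    dict(item)                    # -> {'ztitle': '霸王别姬'}
    # item['foo'] = 1             # undeclared field: raises KeyError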

    3. Create the spider file

    Enter in the DOS (command) window:

    scrapy genspider -t crawl myspider maoyan.com
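    The -t crawl flag selects the CrawlSpider template, which supports the Rule-based link following used in the next step. The available templates (basic, crawl, csvfeed, xmlfeed) can be listed with:

    scrapy genspider -l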
    

    4. Write myspider.py (receive the responses and process the data)

    # -*- coding: utf-8 -*-
    import scrapy
    # link extractor and crawling rules
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule
    # import the item template
    from maoyan.items import MaoyanItem
    
    
    class MaoyanSpider(CrawlSpider):
        name = 'myspider'
        allowed_domains = ['maoyan.com']
        start_urls = ['https://maoyan.com/board/4?offset=0']
    
        rules = (
            # follow the paginated board pages
            Rule(LinkExtractor(allow=r'offset=\d+'), follow=True),
            # hand every film detail page to the callback
            Rule(LinkExtractor(allow=r'/films/\d+'), callback='parse_maoyan', follow=False),
        )
    
        def parse_maoyan(self, response):
            item = MaoyanItem()
            # Chinese / English film title
            item['ztitle'] = response.xpath('//h3/text()').extract()[0]
            item['etitle'] = response.xpath('//div[@class="ename ellipsis"]/text()').extract()[0]
            # genre
            item['type'] = response.xpath('//li[@class="ellipsis"][1]/text()').extract()[0]
            # director
            item['dname'] = response.xpath('//a[@class="name"]/text()').extract()[0].strip()
            # top three leading actors
            star_1 = response.xpath('//li[@class="celebrity actor"][1]//a[@class="name"]/text()').extract()[0].strip()
            star_2 = response.xpath('//li[@class="celebrity actor"][2]//a[@class="name"]/text()').extract()[0].strip()
            star_3 = response.xpath('//li[@class="celebrity actor"][3]//a[@class="name"]/text()').extract()[0].strip()
            # join the three names with '\' separators
            item['star'] = star_1 + '\\' + star_2 + '\\' + star_3
            # release date
            item['releasetime'] = response.xpath('//li[@class="ellipsis"][3]/text()').extract()[0]
            # running time (last five characters of the text)
            item['time'] = response.xpath('//li[@class="ellipsis"][2]/text()').extract()[0].strip()[-5:]
            # rating: not captured; the digits are obfuscated in the page source
            # item['score'] = response.xpath('//span[@class="stonefont"]/text()').extract()[0]
            item['score'] = "None"
            # poster image URL
            item['image'] = response.xpath('//img[@class="avatar"]/@src').extract()[0]
            # synopsis
            item['info'] = response.xpath('//span[@class="dra"]/text()').extract()[0].strip()
    
            yield item
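
    The two rules split the work: the first follows the paginated board pages (offset=0 through offset=90) without a callback, and the second sends every /films/<id> detail page to parse_maoyan. A CrawlSpider must not override parse(), hence the custom callback name. Individual XPaths can be checked interactively before a full crawl, e.g. (any film detail URL will do):

    scrapy shell "https://maoyan.com/films/1203"
    >>> response.xpath('//h3/text()').extract_first()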
    

      

    5. Write pipelines.py (store the data)

    # -*- coding: utf-8 -*-
    
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
    import json
    
    class MaoyanPipeline(object):
        def __init__(self):
            self.filename = open('maoyan.txt','wb')
    
        def process_item(self, item, spider):
            text = json.dumps(dict(item), ensure_ascii=False) + '\n'
            self.filename.write(text.encode('utf-8'))
            return item
            return item
    
        def close_spider(self,spider):
            self.filename.close()
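
    Since each item is written as one JSON object per line, the file reads back easily; a small sketch:

    import json

    with open('maoyan.txt', encoding='utf-8') as f:
        movies = [json.loads(line) for line in f if line.strip()]
    print(len(movies), movies[0]['ztitle'])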
    

      

    6. Edit settings.py (headers, pipelines, and so on)

    The robots protocol:

    # Obey robots.txt rules
    ROBOTSTXT_OBEY = False  

    headers

    DEFAULT_REQUEST_HEADERS = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
      # 'Accept-Language': 'en',
    }
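    If only the user agent matters, the dedicated USER_AGENT setting works as well:

    USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'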
    

    pipelines

    ITEM_PIPELINES = {
        'maoyan.pipelines.MaoyanPipeline': 300,
    }
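
    Optionally (my addition, not part of the original setup), throttling the crawl makes it less likely to be blocked:

    # optional politeness settings
    DOWNLOAD_DELAY = 1         # seconds between requests
    CONCURRENT_REQUESTS = 8    # default is 16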

    7. Run the spider

    Enter in the DOS (command) window:

    scrapy crawl myspider 
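
    Run this from inside the project directory. Items can also be dumped without the custom pipeline through Scrapy's built-in feed exports, e.g.:

    scrapy crawl myspider -o maoyan.json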

    Run result:

     

    Hmm, only 99 of the Top 100 films were scraped.

    Problems:

    In the page source the rating renders as □.□!!! It's an anti-scraping trick (the digits are obfuscated via a custom webfont); the rating can be found on the list page instead, but I didn't bother working around it.

    Scraping ztitle alone returns all 100 titles, so most likely one of the other fields' XPath fails to match on a detail page that lacks it; the spider does its job, which is good enough (see the sketch below).
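
    A plausible fix (my assumption; I have not pinned down the failing page): use extract_first() with a default instead of extract()[0], so a missing node yields a placeholder rather than an IndexError that silently drops the whole item. Sketch for one field; the same pattern applies to the rest:

    # defensive extraction: no exception when the node is absent
    item['etitle'] = response.xpath('//div[@class="ename ellipsis"]/text()').extract_first(default='None')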

    The crawl succeeds.

    8. Store the data in a MySQL database

    Create the corresponding database and table in MySQL:
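
    For reference, a sketch that creates a matching database and table through pymysql (the column types are my assumptions; the equivalent statements in the MySQL console work just as well):

    import pymysql

    connect = pymysql.connect(host='localhost', user='root', password='', charset='utf8')
    cursor = connect.cursor()
    cursor.execute("CREATE DATABASE IF NOT EXISTS maoyan DEFAULT CHARACTER SET utf8")
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS maoyan.maoyantop100 (
            id INT AUTO_INCREMENT PRIMARY KEY,
            ztitle VARCHAR(100), etitle VARCHAR(100), type VARCHAR(50),
            dname VARCHAR(50), star VARCHAR(100), releasetime VARCHAR(50),
            time VARCHAR(20), score VARCHAR(10), image VARCHAR(255), info TEXT
        ) DEFAULT CHARSET=utf8
    """)
    connect.commit()
    connect.close()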

    Then all it takes is rewriting pipelines.py:

    import pymysql.cursors
    
    class MaoyanPipeline(object):
        def __init__(self):
            # connect to the database
            self.connect = pymysql.connect(
                host = 'localhost',
                user = 'root',
                password = '',
                database = 'maoyan',
                charset = 'utf8'  # note: 'utf8', not 'utf-8'
                )
            self.cursor = self.connect.cursor()  # create a cursor
    
        def process_item(self, item, spider):
            item = dict(item)
            sql = "insert into maoyantop100(ztitle,etitle,type,dname,star,releasetime,time,score,image,info) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
            self.cursor.execute(sql,(item['ztitle'],item['etitle'],item['type'],item['dname'],item['star'],item['releasetime'],item['time'],item['score'],item['image'],item['info'],))
            self.connect.commit()
            return item
    
        def close_spider(self,spider):
            self.cursor.close()
            self.connect.close()
    

    Run it:

    Stored successfully:
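
    A quick sanity check on what landed in the table (a hypothetical snippet, not from the original post):

    import pymysql

    connect = pymysql.connect(host='localhost', user='root', password='',
                              database='maoyan', charset='utf8')
    cursor = connect.cursor()
    cursor.execute("SELECT COUNT(*) FROM maoyantop100")
    print(cursor.fetchone()[0])   # expect 99 rows, matching the crawl
    connect.close()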
