A bilibili spider built on the Scrapy crawler framework
# Spider program
import scrapy
from scrapy import Selector
from ..items import GetItem  # import the item definition

class BiliSpider(scrapy.Spider):
    name = 'bili'
    allowed_domains = ['bilibili.com']
    start_urls = ['https://www.bilibili.com/']  # page to crawl

    def parse(self, response):
        sel = Selector(response)
        # video cards on the homepage feed
        list_items = sel.xpath('/html/body/div[2]/div[2]/main/div[2]/div/div[1]/div')
        for list_item in list_items:
            spider_item = GetItem()
            spider_item['title'] = list_item.css('h3::attr(title)').get()  # title
            spider_item['author'] = list_item.css('span.bili-video-card__info--author::text').get()  # uploader
            spider_item['time'] = list_item.css('span.bili-video-card__info--date::text').get()  # publish date
            spider_item['link'] = list_item.css('h3 > a::attr(href)').get()  # video link
            yield spider_item

# items file (items.py)
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class GetItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    author = scrapy.Field()
    time = scrapy.Field()
    link = scrapy.Field()

# Configuration
Enable cookies in settings.py and add request headers (e.g. a browser User-Agent).
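A minimal sketch of the corresponding settings.py entries; the User-Agent string and the Cookie placeholder are illustrative assumptions and should be replaced with values copied from your own browser:

# settings.py (excerpt)
COOKIES_ENABLED = True  # keep/send cookies with requests
DEFAULT_REQUEST_HEADERS = {
    # example UA string; replace with your own browser's User-Agent
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/120.0 Safari/537.36',
    'Referer': 'https://www.bilibili.com/',
    # 'Cookie': '<paste the cookie string from your logged-in browser session here>',
}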

# Run from the command line and save the results as a CSV file
scrapy crawl bili -o bili.csv
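The -o flag chooses the export format from the file extension, so the same command can also write JSON, e.g. scrapy crawl bili -o bili.json. If the Chinese text in the CSV appears garbled when opened in Excel, adding FEED_EXPORT_ENCODING = 'utf-8-sig' to settings.py is a common fix.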

