Crawling selected Sina Sports sections with the Scrapy framework
This post walks through crawling several sections of Sina Sports with the Scrapy framework: the spider collects the URLs of five league boards (Champions League, La Liga, Serie A, Bundesliga, and so on) from the home page, a downloader middleware renders each board with Selenium, and an item pipeline stores the results in MySQL. The spider code comes first, followed by items.py, middlewares.py, pipelines.py, and settings.py.
import scrapy
from selenium import webdriver
from sohuPro.items import SohuproItem
class SohuSpider(scrapy.Spider):
    name = 'sohu'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://sports.sina.com.cn/']

    # Goal: crawl the news content of five Sina Sports sections
    # (Champions League, La Liga, Serie A, Bundesliga, etc.)
    models_urls = []  # stores the URLs of the five sections

    def __init__(self):
        self.bro = webdriver.Chrome(executable_path=r'D:\python\Reptiliane\爬蟲\chromedriver.exe')

    def parse(self, response):
        # 1. Request the home page and parse out the five section entries
        li_list = response.xpath('//*[@id="j_top"]/div[2]/div/ul[1]/li')
        print(len(li_list))
        # 2. Extract the URL of each section from li_list
        alist = [0, 1, 2, 3, 4]
        for index in alist:
            model_url = li_list[index].xpath('./a/@href').extract_first()
            print(model_url)
            self.models_urls.append(model_url)
        # Request every section URL. The section pages are not dynamically
        # generated and could be requested directly; Selenium is used here
        # to get familiar with how downloader middlewares work.
        for url in self.models_urls:
            yield scrapy.Request(url=url, callback=self.parse_detail)

    def parse_detail(self, response):
        li_list = response.xpath('//*[@id="contest_list"]/li')
        for li in li_list:
            game_time = li.xpath('./p/text()').extract_first()
            game_team = li.xpath('./a//text()').extract_first()
            # Instantiate an item for persistent storage
            item = SohuproItem()
            item['game_time'] = game_time
            item['game_team'] = game_team
            yield item  # submit the item to the pipeline

    def closed(self, reason):
        # Called automatically when the spider closes; shut down the browser
        self.bro.quit()
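With the project files below in place, the crawl can be started from inside the project directory with the usual Scrapy command (the spider name 'sohu' comes from the class above):

scrapy crawl sohu

Because parse() fills models_urls before yielding the section requests, the downloader middleware shown later can recognise those URLs when their responses pass back through it.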
items.py
import scrapy
class SohuproItem(scrapy.Item):
    game_time = scrapy.Field()
    game_team = scrapy.Field()
middlewares.py
from scrapy import signals

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
import time
from scrapy.http import HtmlResponse
class SohuproDownloaderMiddleware:

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    # Intercept every response, pick out those belonging to the five section
    # URLs, and swap them for Selenium-rendered pages
    def process_response(self, request, response, spider):
        bro = spider.bro  # the browser instance created in the spider
        if request.url in spider.models_urls:
            bro.get(request.url)
            time.sleep(3)
            page_text = bro.page_source  # now contains the dynamically loaded content
            # Build a new response object holding the rendered section page
            # and return it in place of the original response
            new_response = HtmlResponse(url=request.url, body=page_text,
                                        encoding='utf8', request=request)
            return new_response
        else:
            return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass
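The fixed time.sleep(3) is the simplest way to give the page time to render, but it always costs three seconds and may still be too short on a slow connection. A possible refinement (a sketch, not part of the original code) is an explicit Selenium wait for the contest_list element that the spider actually parses:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# inside process_response(), after bro.get(request.url):
# wait up to 10 seconds for the list the spider parses to be present,
# instead of always sleeping for a fixed 3 seconds
WebDriverWait(bro, 10).until(
    EC.presence_of_element_located((By.ID, 'contest_list'))
)
page_text = bro.page_source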
pipelines.py
from itemadapter import ItemAdapter
import pymysql


class SohuproPipeline:

    def open_spider(self, spider):
        self.conn = pymysql.connect(host='localhost', user='root', password='root',
                                    port=3306, db='scrapy', charset='utf8')
        print('crawl started!!!')

    def process_item(self, item, spider):
        self.cursor = self.conn.cursor()
        value = (item['game_time'], item['game_team'])
        try:
            sql = 'insert into job_7(game_time, game_team) values(%s, %s)'
            self.cursor.execute(sql, value)
            self.conn.commit()
            print('insert succeeded')
        except Exception as e:
            print('insert failed!!', e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        print('crawl finished!!')
        self.cursor.close()
        self.conn.close()
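The pipeline assumes a MySQL database named scrapy with a job_7 table already in place. A minimal setup sketch with pymysql is shown below; the column types and lengths are assumptions, since the original post does not show the table definition:

import pymysql

# connect without selecting a database so it can be created first
conn = pymysql.connect(host='localhost', user='root', password='root',
                       port=3306, charset='utf8')
cursor = conn.cursor()
cursor.execute('CREATE DATABASE IF NOT EXISTS scrapy CHARACTER SET utf8')
cursor.execute('USE scrapy')
# column lengths are assumptions; adjust them to the actual data
cursor.execute('CREATE TABLE IF NOT EXISTS job_7 ('
               'game_time VARCHAR(100), '
               'game_team VARCHAR(255))')
conn.commit()
cursor.close()
conn.close()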
settings.py
# Scrapy settings for sohuPro project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'sohuPro'

SPIDER_MODULES = ['sohuPro.spiders']
NEWSPIDER_MODULE = 'sohuPro.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:83.0) Gecko/20100101 Firefox/83.0'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False
LOG_LEVEL = 'ERROR'

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
# 'sohuPro.middlewares.SohuproSpiderMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'sohuPro.middlewares.SohuproDownloaderMiddleware': 543,
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'sohuPro.pipelines.SohuproPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'