Crawling selected Sina Sports sections with the Scrapy framework
This post walks through crawling several sections of Sina Sports with the Scrapy framework: the spider collects the URLs of five league boards (Champions League, La Liga, Serie A, Bundesliga, and so on) from the home page, a downloader middleware renders each board with Selenium, and an item pipeline stores the results in MySQL. The spider code comes first, followed by items.py, middlewares.py, pipelines.py, and settings.py.
import scrapy
from selenium import webdriver
from sohuPro.items import SohuproItem
class SohuSpider(scrapy.Spider):
    name = 'sohu'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://sports.sina.com.cn/']

    # Goal: crawl the news content of five Sina Sports sections
    # (Champions League, La Liga, Serie A, Bundesliga, etc.)
    models_urls = []  # stores the URLs of the five sections

    def __init__(self):
        self.bro = webdriver.Chrome(executable_path=r'D:\python\Reptiliane\爬蟲\chromedriver.exe')

    def parse(self, response):
        # 1. Request the home page and parse out the five section entries
        li_list = response.xpath('//*[@id="j_top"]/div[2]/div/ul[1]/li')
        print(len(li_list))
        # 2. Extract the URL of each section from li_list
        alist = [0, 1, 2, 3, 4]
        for index in alist:
            model_url = li_list[index].xpath('./a/@href').extract_first()
            print(model_url)
            self.models_urls.append(model_url)
        # Request every section URL. The section pages are not dynamically
        # generated and could be requested directly; Selenium is used here
        # to get familiar with how downloader middlewares work.
        for url in self.models_urls:
            yield scrapy.Request(url=url, callback=self.parse_detail)

    def parse_detail(self, response):
        li_list = response.xpath('//*[@id="contest_list"]/li')
        for li in li_list:
            game_time = li.xpath('./p/text()').extract_first()
            game_team = li.xpath('./a//text()').extract_first()
            # Instantiate an item for persistent storage
            item = SohuproItem()
            item['game_time'] = game_time
            item['game_team'] = game_team
            yield item  # submit the item to the pipeline

    def closed(self, reason):
        # Called automatically when the spider closes; shut down the browser
        self.bro.quit()
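With the project files below in place, the crawl can be started from inside the project directory with the usual Scrapy command (the spider name 'sohu' comes from the class above):

scrapy crawl sohu

Because parse() fills models_urls before yielding the section requests, the downloader middleware shown later can recognise those URLs when their responses pass back through it.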
items.py
import scrapy
class SohuproItem(scrapy.Item):
    game_time = scrapy.Field()
    game_team = scrapy.Field()
middlewares.py
from scrapy import signals

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
import time
from scrapy.http import HtmlResponse
class SohuproDownloaderMiddleware:

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    # Intercept every response, pick out those belonging to the five section
    # URLs, and swap them for Selenium-rendered pages
    def process_response(self, request, response, spider):
        bro = spider.bro  # the browser instance created in the spider
        if request.url in spider.models_urls:
            bro.get(request.url)
            time.sleep(3)
            page_text = bro.page_source  # now contains the dynamically loaded content
            # Build a new response object holding the rendered section page
            # and return it in place of the original response
            new_response = HtmlResponse(url=request.url, body=page_text,
                                        encoding='utf8', request=request)
            return new_response
        else:
            return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass
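The fixed time.sleep(3) is the simplest way to give the page time to render, but it always costs three seconds and may still be too short on a slow connection. A possible refinement (a sketch, not part of the original code) is an explicit Selenium wait for the contest_list element that the spider actually parses:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# inside process_response(), after bro.get(request.url):
# wait up to 10 seconds for the list the spider parses to be present,
# instead of always sleeping for a fixed 3 seconds
WebDriverWait(bro, 10).until(
    EC.presence_of_element_located((By.ID, 'contest_list'))
)
page_text = bro.page_source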
pipelines.py
from itemadapter import ItemAdapter
import pymysql


class SohuproPipeline:

    def open_spider(self, spider):
        self.conn = pymysql.connect(host='localhost', user='root', password='root',
                                    port=3306, db='scrapy', charset='utf8')
        print('crawl started!!!')

    def process_item(self, item, spider):
        self.cursor = self.conn.cursor()
        value = (item['game_time'], item['game_team'])
        try:
            sql = 'insert into job_7(game_time, game_team) values(%s, %s)'
            self.cursor.execute(sql, value)
            self.conn.commit()
            print('insert succeeded')
        except Exception as e:
            print('insert failed!!', e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        print('crawl finished!!')
        self.cursor.close()
        self.conn.close()
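The pipeline assumes a MySQL database named scrapy with a job_7 table already in place. A minimal setup sketch with pymysql is shown below; the column types and lengths are assumptions, since the original post does not show the table definition:

import pymysql

# connect without selecting a database so it can be created first
conn = pymysql.connect(host='localhost', user='root', password='root',
                       port=3306, charset='utf8')
cursor = conn.cursor()
cursor.execute('CREATE DATABASE IF NOT EXISTS scrapy CHARACTER SET utf8')
cursor.execute('USE scrapy')
# column lengths are assumptions; adjust them to the actual data
cursor.execute('CREATE TABLE IF NOT EXISTS job_7 ('
               'game_time VARCHAR(100), '
               'game_team VARCHAR(255))')
conn.commit()
cursor.close()
conn.close()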
settings.py
# Scrapy settings for sohuPro project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'sohuPro'

SPIDER_MODULES = ['sohuPro.spiders']
NEWSPIDER_MODULE = 'sohuPro.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:83.0) Gecko/20100101 Firefox/83.0'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False
LOG_LEVEL = 'ERROR'

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
# 'sohuPro.middlewares.SohuproSpiderMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'sohuPro.middlewares.SohuproDownloaderMiddleware': 543,
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'sohuPro.pipelines.SohuproPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'