2021-05-26--CHEN scary
Sharing this as a reference: a small Scrapy project that crawls Beijing rental listings from Lianjia (bj.lianjia.com/zufang). The spider walks every district, pages through the results, follows each listing's detail page, and a pipeline writes the items to MySQL.
Main program (the spider)
import json

import scrapy

from ..items import LianjiatestchenItem


class LianjiachenSpider(scrapy.Spider):
    name = 'Lianjiachen'
    # allowed_domains = ['bj.lianjia.com/zufang']
    start_urls = ['http://bj.lianjia.com/zufang/']

    def parse(self, response, **kwargs):
        # Collect the link for every district (position()>1 skips the "all" filter)
        area_bj_list = response.xpath('//*[@id="filter"]/ul[2]//li[position()>1]//a/@href').extract()
        # Build the absolute URL for each district
        for i in area_bj_list:
            real_url = "https://bj.lianjia.com" + i
            yield scrapy.Request(url=real_url, callback=self.parse_page_url, dont_filter=True)

    def parse_page_url(self, response):
        # Read the total page count from the pagination widget
        max_page_lianjia = response.xpath('//*[@id="content"]/div[1]/div[2]/@data-totalpage').extract()
        for i in range(1, int(max_page_lianjia[0]) + 1):
            # Build the URL for each page of this district
            url = response.url + "pg" + str(i)
            yield scrapy.Request(url=url, callback=self.parse_message, dont_filter=True)

    def parse_message(self, response):
        # One div per listing on the results page
        all_div = response.xpath('//*[@id="content"]/div[1]/div[1]//div')
        for house in all_div:
            # The title line is "name layout orientation", separated by spaces
            name_type_orientation = house.xpath(
                ".//p[@class='content__list--item--title']/a/text()").extract()[0].strip().split(' ')
            # Listing title
            name = name_type_orientation[0]
            # Layout, e.g. 2室1厅1卫
            house_type = name_type_orientation[1]
            # Facing direction
            orientation = name_type_orientation[2]
            # District
            area = house.xpath(".//p[@class='content__list--item--des']/a/text()").extract()[0]
            # Street / business area
            street = house.xpath(".//p[@class='content__list--item--des']/a[2]/text()").extract()[0]
            # Exact neighborhood
            concrete = house.xpath(".//p[@class='content__list--item--des']/a[3]/text()").extract()[0]
            # Monthly rent
            lease = house.xpath(".//span/em/text()").extract()[0]
            # Listing tags, joined with '-', or a placeholder when empty
            characteristic = house.xpath(".//p[3]//i/text()").extract()
            characteristic = '-'.join(characteristic) if characteristic else '空的'
            # Last-maintained date
            maintenance_time = house.xpath(".//p[4]/span[2]/text()").extract()[0]

            item = LianjiatestchenItem()
            item['name'] = name
            item['house_type'] = house_type
            item['orientation'] = orientation
            item['street'] = street
            item['area'] = area
            item['concrete'] = concrete
            item['lease'] = lease
            item['characteristic'] = characteristic
            item['maintenance_time'] = maintenance_time

            # Follow the detail page to fill in the remaining fields
            detail_url = house.xpath(".//p[@class='content__list--item--title']/a/@href").extract()[0]
            detail_url = 'https://bj.lianjia.com' + detail_url
            yield scrapy.Request(url=detail_url, callback=self.detail_page, meta={'item': item})

    def detail_page(self, response):
        item = response.meta['item']
        # The floor line looks like "楼层 中楼层/6层"
        loupan = response.xpath('//*[@id="aside"]/ul/li[3]/span[2]/text()').extract()[0]
        loupan = loupan.split(' ')[1].split('/')
        item['floor_properties'] = loupan[0]
        item['floor_num'] = loupan[1]

        # The agent's 400 number is served by a separate JSON endpoint;
        # its parameters come from the data-agent attribute on the page
        phone_url = 'https://ex.lianjia.com/sdk/phone400'
        data_dict = response.xpath('//*[@id="aside"]/div[2]/div[1]/@data-agent').extract()[0]
        data_dict = json.loads(data_dict)
        ucId = data_dict['ucId']
        digV = data_dict['digV']
        # digV is itself a JSON string that contains adId
        adId = json.loads(digV)['adId']
        data = {
            "adId": str(adId),
            "digV": str(digV),
            "hdicCityId": "110000",
            "mediumId": "100000032",
            "mobileType": "AGENT",
            "required400": "true",
            "ucId": str(ucId),
        }
        yield scrapy.Request(url=phone_url, callback=self.phone_num, method='POST',
                             body=json.dumps(data),
                             headers={"Content-Type": "application/json"},
                             meta={'item': item})

    def phone_num(self, response):
        item = response.meta['item']
        phone = json.loads(response.text)
        item['phone_num'] = phone['data'][0]['phone400']
        yield item
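The fiddliest part of the spider is the phone-number request: data-agent holds a JSON object whose digV field is itself a JSON string, so it has to be decoded twice before adId is reachable. Here is a minimal standalone sketch of that double decode, with an invented attribute value (the real keys match the names used above, but these sample values are made up):

import json

# Invented stand-in for the page's data-agent attribute value
raw = '{"ucId": "10086", "digV": "{\\"adId\\": 42, \\"src\\": \\"detail\\"}"}'

outer = json.loads(raw)            # first decode: the attribute itself
dig_v = outer['digV']              # still a JSON *string*, not a dict
ad_id = json.loads(dig_v)['adId']  # second decode reaches adId

print(outer['ucId'], ad_id)        # -> 10086 42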
Pipeline

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql


class LianjiaSpiderPipeline:
    def __init__(self):
        self.conn_mysql()

    def conn_mysql(self):
        # Open one MySQL connection for the lifetime of the pipeline
        self.db = pymysql.connect(host='127.0.0.1', port=3306, user='root', password='123456',
                                  database='test_chen_Lianjia', charset='utf8')
        self.cursor = self.db.cursor()

    def process_item(self, item, spider):
        # Column names match the fields of LianjiatestchenItem
        sql = """insert into linajia_table
                 (name, house_type, orientation, street, area, concrete, lease,
                  characteristic, maintenance_time, floor_properties, floor_num, phone_num)
                 values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""
        try:
            # Execute the parameterized insert
            self.cursor.execute(sql, (
                item['name'], item['house_type'], item['orientation'], item['street'],
                item['area'], item['concrete'], item['lease'], item['characteristic'],
                item['maintenance_time'], item['floor_properties'], item['floor_num'],
                item['phone_num']))
            # Commit the insert to the database
            self.db.commit()
        except Exception as e:
            print(e)
            # Roll back so the insert is all-or-nothing
            self.db.rollback()
        return item
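The pipeline assumes linajia_table already exists in test_chen_Lianjia. A one-off setup sketch that creates it follows; the column types and lengths are my assumptions, only the names are fixed by the insert statement above:

import pymysql

# One-off setup script; column types are assumptions, names match the pipeline
db = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                     password='123456', database='test_chen_Lianjia', charset='utf8')
cursor = db.cursor()
cursor.execute("""
    create table if not exists linajia_table (
        id int auto_increment primary key,
        name varchar(255), house_type varchar(64), orientation varchar(64),
        street varchar(128), area varchar(128), concrete varchar(128),
        lease varchar(32), characteristic varchar(255),
        maintenance_time varchar(64), floor_properties varchar(64),
        floor_num varchar(32), phone_num varchar(32)
    ) default charset=utf8
""")
db.commit()
db.close()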
Items

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy


class LianjiatestchenItem(scrapy.Item):
    # Fields filled from the listings page
    name = scrapy.Field()              # listing title
    house_type = scrapy.Field()        # layout, e.g. 2室1厅1卫
    orientation = scrapy.Field()       # facing direction
    street = scrapy.Field()            # street / business area
    area = scrapy.Field()              # district
    concrete = scrapy.Field()          # exact neighborhood
    lease = scrapy.Field()             # monthly rent
    characteristic = scrapy.Field()    # listing tags
    maintenance_time = scrapy.Field()  # last-maintained date
    # Fields filled from the detail page and the phone API
    floor_properties = scrapy.Field()  # e.g. 中楼层
    floor_num = scrapy.Field()         # total floors
    phone_num = scrapy.Field()         # agent's 400 number
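For any of this to run, the pipeline has to be registered in settings.py. A minimal sketch; the module path assumes the project package is named lianjiatestchen (a guess from the item class name, so adjust it to your actual project):

# settings.py (excerpt) -- package path is an assumption
ITEM_PIPELINES = {
    'lianjiatestchen.pipelines.LianjiaSpiderPipeline': 300,
}
ROBOTSTXT_OBEY = False  # often needed for commercial sites; check before relying on it
DOWNLOAD_DELAY = 1      # be gentle; aggressive clients tend to get rate-limited

With that in place, running scrapy crawl Lianjiachen from the project root starts the crawl (the name comes from the spider's name attribute).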
總結(jié)
That is the whole project: the spider gathers the listing fields, floor information, and the agent's 400 phone number, and the pipeline stores each item in MySQL. Hopefully it helps you solve a similar problem.