python小爬虫(爬取职位信息和博客文章信息)
生活随笔
收集整理的這篇文章主要介紹了
python小爬虫(爬取职位信息和博客文章信息)
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
1.python爬取招聘信息
簡單爬取智聯招聘職位信息(僅供學習)
# !/usr/bin/env python
# -*-coding:utf-8-*-
"""
Scrape Zhilian (highpin.cn) job-posting search results and store them in MySQL.

@Author  : xiaofeng
@Time    : 2018/12/18 16:31
@Desc    : Less interests, More interest. (scrape Zhilian job data)
@Project : python_appliction
@FileName: zhilianzhaopin.py
@Software: PyCharm
@Blog    : https://blog.csdn.net/zwx19921215
"""
import pymysql as db
import requests

# MySQL connection settings.
# NOTE(review): credentials are hard-coded; move to config/env for real use.
mysql_config = {
    'host': '101.0.2.110',
    'user': 'test',
    'password': 'test',
    'database': 'xiaofeng',
    'charset': 'utf8'
}

# Job-search API endpoint (JSON POST).
url = 'https://data.highpin.cn/api/JobSearch/Search'


def zhilian(page, position):
    """Fetch one page of job-search results from the API.

    :param page: 1-based page index to request
    :param position: job keyword to search for (e.g. 'java')
    :return: decoded JSON response as a dict
    """
    # Browser-like headers; Referer/Origin are required by the endpoint.
    headers = {
        'Referer': 'https://www.highpin.cn/zhiwei/',
        'Origin': 'https://www.highpin.cn',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        'Accept': 'application/json, text/javascript, * / *; q=0.01',
    }
    # Form payload: Q = keyword, pageIndex = page number.
    datas = {
        'Q': position,
        'pageIndex': page
    }
    resp = requests.post(url, data=datas, headers=headers)
    return resp.json()


def print_data(result):
    """Dump the job list of one API response to the console."""
    body = result['body']['JobList']
    print(body)


def insert(result):
    """Insert a list of job dicts into the `zhilian` table.

    :param result: list of job dicts as returned in body.JobList
    """
    print("insert......")
    # Fixed: the original string carried stray '\,' and '\V' escapes that put
    # literal backslashes into the SQL sent to MySQL.
    sql = ("INSERT INTO zhilian(JobID,JobTitle,ReferrerType,CompanyName,"
           "AnnualSalaryMin,AnnualSalaryMax,JobLactionStr,JobLactionID,JobTags,"
           "JobDegree,JobDegreeId,WorkExperience,WorkExperienceID,"
           "CompanyIndustry,CompanyIndustryID,CompanyType,CompanyTypeID,"
           "PublishDate,CompanyScale,SalaryWhite) "
           "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
    database = db.connect(**mysql_config)
    try:
        cursor = database.cursor()
        for item in result:
            print(item)
            # list-valued fields must be stringified before parameter binding
            job_laction_id = str(item['JobLactionID'])
            company_industry_id = str(item['CompanyIndustryID'])
            # JobTags is optional in the payload; default to empty string
            job_tags = str(item['JobTags']) if 'JobTags' in item else ''
            cursor.execute(sql, (
                item['JobID'], item['JobTitle'], item['ReferrerType'],
                item['CompanyName'], item['AnnualSalaryMin'],
                item['AnnualSalaryMax'], item['JobLactionStr'],
                job_laction_id, job_tags, item['JobDegree'],
                item['JobDegreeId'], item['WorkExperience'],
                item['WorkExperienceID'], item['CompanyIndustry'],
                company_industry_id, item['CompanyType'],
                item['CompanyTypeID'], item['PublishDate'],
                item['CompanyScale'], item['SalaryWhite']))
        database.commit()
        cursor.close()
    finally:
        # Fixed: connection is now released even if an insert fails.
        database.close()


def main(position):
    """Crawl every result page for `position` and persist each page."""
    result = zhilian(1, position)
    page_count = result['body']['PageCount']
    print("---------------共", page_count, "頁-------------")
    page = 1
    while page <= page_count:
        print('----------------第', page, '頁-----------------')
        result = zhilian(page, position)
        # print_data(result)
        body = result['body']['JobList']
        insert(body)
        page = page + 1


if __name__ == '__main__':
    main('java')
控制臺輸出信息
(此處原為控制臺輸出截圖,已丟失)
入庫數據
(此處原為入庫數據截圖,已丟失)
2.python爬取csdn博客文章
python簡單爬取csdn博客文章列表(僅供學習)
步驟:
1.分頁獲取博客url
2.解析html 獲取指定信息
# !/usr/bin/env python
# -*-coding:utf-8-*-
"""
Scrape the article list (title + link) of a CSDN blog, page by page.

@Author  : xiaofeng
@Time    : 2018/12/20 11:30
@Desc    : Less interests, More interest. (scrape CSDN blog article list)
@Project : python_appliction
@FileName: csdn.py
@Software: PyCharm
@Blog    : https://blog.csdn.net/zwx19921215
"""
import requests
from lxml import html

# Browser-like request headers.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
}


def parse_html(url):
    """Fetch one list page and print each article's title and link.

    :param url: full URL of one paginated article-list page
    """
    response = requests.get(url=url, headers=headers)
    text = html.fromstring(response.text)
    hrefs = text.xpath('//h4/a/@href')
    title = text.xpath('//h4/a/text()')
    # Abstract, publish date, read/comment counts etc. can be extracted the
    # same way if needed.
    # CSDN injects one extra default entry at the top of every list page,
    # so drop its link ...
    hrefs.pop(0)
    # Normalize titles: strip newlines/whitespace, discard empty strings.
    titles = [str(t).replace('\n', '').strip()
              for t in title
              if str(t).replace('\n', '').strip() != '']
    # ... and drop the matching extra title ("帝都的凜冬").
    titles.pop(0)
    # Fixed: pair with zip instead of a manual index so a length mismatch
    # between titles and hrefs cannot raise IndexError.
    for item_title, item_href in zip(titles, hrefs):
        results = {
            '標題': item_title,
            '鏈接': item_href
        }
        print(results)


def get_page(page_url, page):
    """Crawl pages for as long as a "next page" control is present.

    Fixed: iterative instead of one recursive call per page, so a long blog
    cannot hit Python's recursion limit. Fetch/print order per page is
    unchanged.

    :param page_url: list URL prefix; the page number is appended
    :param page: starting page number
    :return: -1 once no further page exists
    """
    next_xpath = ('//div[@class="ui-paging-container"]/ul/'
                  'li[@class="js-page-next js-page-action ui-pager"]/text()')
    while True:
        url = page_url + str(page)
        print('url=', url)
        response = requests.get(url=url, headers=headers)
        text = html.fromstring(response.text)
        if not text.xpath(next_xpath):
            return -1
        parse_html(url)
        page = page + 1


def get_page2(page_url, page):
    """Crawl a fixed range of list pages (up to page 10).

    :param page_url: list URL prefix; the page number is appended
    :param page: starting page number
    """
    while page <= 10:
        url = page_url + str(page)
        print('\n')
        print("----------------------第", page, "頁--------------------")
        print('url=', url)
        print('\n')
        parse_html(url)
        page = page + 1


if __name__ == '__main__':
    page_url = 'https://blog.csdn.net/zwx19921215/article/list/'
    get_page2(page_url, 1)
總結
以上是生活随笔為你收集整理的python小爬虫(爬取职位信息和博客文章信息)的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: pycharm的配置_pycharm怎么
- 下一篇: python数据分析入门