from urllib.request import urlopen

# Fetch the page and print its raw body bytes. The context manager closes
# the HTTP response as soon as the body has been read.
with urlopen("http://pythonscraping.com/pages/page1.html") as html:
    print(html.read())
BeautifulSoup 通過定位 HTML 標籤來格式化和組織複雜的網絡信息,用簡單易用的 Python 對象為我們展現 XML 結構信息。
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup


def getTitle(url):
    """Return the ``<h1>`` found under ``<body>`` of the page at ``url``.

    Args:
        url: Address of the page to download and parse.

    Returns:
        Whatever ``bsObj.body.h1`` evaluates to, or None when the request
        raises ``HTTPError`` or when navigating to ``body.h1`` raises
        ``AttributeError``. In both failure cases the exception is printed.
    """
    try:
        # Read the whole body inside the context manager so the HTTP
        # response is closed before parsing starts.
        with urlopen(url) as response:
            markup = response.read()
    except HTTPError as e:
        print(e)
        return None
    try:
        bsObj = BeautifulSoup(markup, "lxml")
        # Raises AttributeError if ``bsObj.body`` is None.
        title = bsObj.body.h1
    except AttributeError as e:
        print(e)
        return None
    return title


title = getTitle("http://pythonscraping.com/pages/page1.html")
# Identity check: None is a singleton, and ``==`` would go through the
# operand's own ``__eq__`` implementation.
if title is None:
    print("Title could not be found!")
else:
    print(title)
複雜的 HTML 解析
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup

# Print the text of every <span class="green"> on the page.
try:
    html = urlopen("http://www.pythonscraping.com/pages/warandpeace.html")
except HTTPError as e:
    # No response means nothing to parse; report the error and stop
    # instead of falling through to code that needs ``html``.
    print(e)
else:
    # Close the HTTP response once the parser has consumed it.
    with html:
        bsObj = BeautifulSoup(html, "lxml")
    namelist = bsObj.find_all("span", {"class": "green"})
    for name in namelist:
        print(name.get_text())
.get_text() 會把你正在處理的 HTML 文檔中所有的標籤都清除,然後返回一個只包含文字的字符串。假如你正在處理一個包含許多超鏈接、段落和標籤的大段源代碼,那麼 .get_text() 會把這些超鏈接、段落和標籤都清除掉,只剩下一串不帶標籤的文字。
導航樹
子標籤和後代標籤
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup

# Print each direct child of the table whose id is "giftList".
try:
    html = urlopen("http://www.pythonscraping.com/pages/page3.html")
except HTTPError as e:
    # No response means nothing to parse; report the error and stop
    # instead of falling through to code that needs ``html``.
    print(e)
else:
    # Close the HTTP response once the parser has consumed it.
    with html:
        bsObj = BeautifulSoup(html, "lxml")
    try:
        # find() yields None when no tag matches, so ``.children`` is the
        # expression that can raise AttributeError.
        children = bsObj.find("table", {"id": "giftList"}).children
    except AttributeError as e:
        print(e)
    else:
        for child in children:
            print(child)
處理兄弟標籤
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup

# Print every sibling that follows the first <tr> of the "giftList" table.
try:
    html = urlopen("http://www.pythonscraping.com/pages/page3.html")
except HTTPError as e:
    # No response means nothing to parse; report the error and stop
    # instead of falling through to code that needs ``html``.
    print(e)
else:
    # Close the HTTP response once the parser has consumed it.
    with html:
        bsObj = BeautifulSoup(html, "lxml")
    try:
        # Either lookup in the chain can yield None (no matching table, or
        # no <tr> inside it), which surfaces here as AttributeError.
        siblings = bsObj.find("table", {"id": "giftList"}).tr.next_siblings
    except AttributeError as e:
        print(e)
    else:
        for sibling in siblings:
            print(sibling)
父標籤處理
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup

# Locate one image by its src, step up to its parent, then print the text
# of the node immediately preceding that parent.
try:
    html = urlopen("http://www.pythonscraping.com/pages/page3.html")
except HTTPError as e:
    # No response means nothing to parse; report the error and stop
    # instead of falling through to code that needs ``html``.
    print(e)
else:
    # Close the HTTP response once the parser has consumed it.
    with html:
        bsObj = BeautifulSoup(html, "lxml")
    try:
        # Any link of this chain may be None (image not found, no previous
        # sibling), which surfaces here as AttributeError.
        image = bsObj.find("img", {"src": "../img/gifts/img1.jpg"})
        text = image.parent.previous_sibling.get_text()
    except AttributeError as e:
        print(e)
    else:
        print(text)
正則表達式
郵箱:[A-Za-z0-9\._+]+@[A-Za-z]+\.(com|org|edu|net)
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup
import re

# Print the src of every <img> whose path matches the gift-image pattern.
try:
    html = urlopen("http://www.pythonscraping.com/pages/page3.html")
except HTTPError as e:
    # No response means nothing to parse; report the error and stop
    # instead of falling through to code that needs ``html``.
    print(e)
else:
    # Close the HTTP response once the parser has consumed it.
    with html:
        bsObj = BeautifulSoup(html, "lxml")
    # Raw string: same pattern text as before, without relying on Python
    # passing unrecognised escapes such as "\." through unchanged.
    src_pattern = re.compile(r"\.\.\/img\/gifts\/img.*\.jpg")
    images = bsObj.find_all("img", {"src": src_pattern})
    for img in images:
        print(img["src"])
獲取屬性
對於一個標籤,可以用 myTag.attrs 獲取所有屬性
myTag.attrs["src"] 表示 myTag 的 src 屬性
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup
import re

# Print the src of every matching <img>, read through the tag's ``attrs``
# mapping.
try:
    html = urlopen("http://www.pythonscraping.com/pages/page3.html")
except HTTPError as e:
    # No response means nothing to parse; report the error and stop
    # instead of falling through to code that needs ``html``.
    print(e)
else:
    # Close the HTTP response once the parser has consumed it.
    with html:
        bsObj = BeautifulSoup(html, "lxml")
    # Raw string: same pattern text as before, without relying on Python
    # passing unrecognised escapes such as "\." through unchanged.
    src_pattern = re.compile(r"\.\.\/img\/gifts\/img.*\.jpg")
    images = bsObj.find_all("img", {"src": src_pattern})
    for img in images:
        print(img.attrs["src"])