# Simple crawler: download one page and print the link text of each
# <div class="title"> entry found in it.
import urllib.request as ur

import bs4  # third-party HTML parsing module (BeautifulSoup)

url = '任意網址'

# Build a Request object and attach the request headers (values are copied
# from what a browser sends to the target page).
request = ur.Request(url, headers={
    "User-Agent": "------",
})

# Fetch the page; the response body is decoded as UTF-8 text.
with ur.urlopen(request) as response:
    data = response.read().decode('utf-8')
# Debug: print(data)

# Let BeautifulSoup parse the HTML document.
root = bs4.BeautifulSoup(data, "html.parser")
# Debug: print(root.title.string)

# Collect every <div> tag whose class is "title"; find_all is used because
# find would return only a single match.
titles = root.find_all("div", class_="title")
# Debug: print(titles)

for title in titles:
    # Skip entries that have no <a> tag before reading the link text.
    if title.a is not None:
        print(title.a.string)
Direct link: https://paste.plurk.com/show/WguJUkFlD34PfWe3NrbV