Python 爬蟲模塊bs4 實戰一：獲取百度貼吧內容

woff · 發表於 2022-5-23 23:17:34

getCommentInfo.py：

import requests
from bs4 import BeautifulSoup
from mylog import MyLog as mylog
class Item(object):
    """Plain record holding the fields scraped for one forum thread.

    Every field defaults to ``None`` at class level; the spider assigns the
    real values on each instance. The ``frist*`` spelling is kept because it
    is the attribute name the rest of the code uses.
    """

    # Thread title.
    title = None
    # Name of the user who opened the thread.
    fristAuthor = None
    # Creation time of the thread.
    fristTime = None
    # Reply count.
    reNum = None
    # Summary text shown in the thread listing.
    content = None
    # Name of the user who replied last.
    lastAuthor = None
    # Time of the last reply.
    lastTime = None
class GetTiebaInfo(object):
    """Scrape thread listings from a Baidu Tieba forum into a text file.

    Instantiating the class runs the whole job: build the listing-page URLs,
    download and parse each page, then write the collected items to disk.
    """

    def __init__(self, url, pageSum=5):
        """Run the crawl.

        Args:
            url: Listing URL whose last ``=``-separated field is the page
                offset (``pn``); that field is replaced for every page.
            pageSum: Number of listing pages to fetch.
        """
        self.url = url
        self.log = mylog()
        self.pageSum = pageSum
        self.urls = self.getUrls(self.pageSum)
        self.items = self.spider(self.urls)
        self.pipelines(self.items)

    def getUrls(self, pageSum):
        """Return ``pageSum`` listing URLs with offsets 0, 50, 100, ...

        Everything after the last ``=`` in ``self.url`` is replaced by the
        offset, so the offset parameter must be the last one in the URL.
        """
        head, sep, _ = self.url.rpartition('=')
        urls = [head + sep + str(page * 50) for page in range(pageSum)]
        self.log.info(u'取得URLS成功')
        return urls

    @staticmethod
    def _text(node):
        """Return the stripped text of a parsed tag, or '' if it is missing."""
        return node.get_text().strip() if node is not None else ''

    def spider(self, urls):
        """Download every URL and return one ``Item`` per thread entry found.

        Pages that could not be downloaded are skipped, and fields whose tag
        is absent are stored as empty strings instead of raising.

        NOTE(review): the class/title selectors below are kept verbatim from
        the original listing; confirm them against the live page markup.
        """
        items = []
        for url in urls:
            htmlContent = self.getResponseContent(url)
            if not htmlContent:
                # Download failed (already logged); nothing to parse.
                continue
            soup = BeautifulSoup(htmlContent, 'lxml')
            tagsli = soup.find_all('li', attrs={'class': 'j_thread_list clearfix'})
            for tag in tagsli:
                item = Item()
                item.title = self._text(tag.find('a', attrs={'class': 'j_th_tit'}))
                authorSpan = tag.find('span', attrs={'class': 'frs-author-name-warp'})
                item.fristAuthor = self._text(authorSpan.a if authorSpan is not None else None)
                # Attribute filters must be str: bytes never equal the parsed
                # (str) attribute values on Python 3.
                item.fristTime = self._text(tag.find('span', attrs={'title': u'建立時間'}))
                item.reNum = self._text(tag.find('span', attrs={'title': u'回覆'}))
                item.content = self._text(tag.find('div', attrs={'class': 'threadlist_abs_onlyline '}))
                item.lastAuthor = self._text(tag.find('span', attrs={'class': 'tb_icon_author_rely j_replayer'}))
                item.lastTime = self._text(tag.find('span', attrs={'title': u'最後回覆時間'}))
                items.append(item)
                self.log.info(u'取得標題為<<%s>>的項成功 ...' % item.title)
        return items

    def pipelines(self, items, fileName=u'擺渡貼吧_權力的遊戲.txt'):
        """Write every item to ``fileName`` as UTF-8 text.

        Args:
            items: Objects exposing the ``Item`` attributes.
            fileName: Output path; defaults to the original fixed name.
        """
        # The file is opened in text mode with an explicit encoding, so the
        # fields are written as str; encoding them first would emit b'...'.
        with open(fileName, 'w', encoding='utf-8') as fp:
            for item in items:
                fp.write('title:%s \t author:%s \t firstTime:%s \n content:%s \n return:%s \n lastAuthor:%s \t lastTime:%s \n\n\n\n'
                         % (item.title, item.fristAuthor, item.fristTime, item.content,
                            item.reNum, item.lastAuthor, item.lastTime))
                self.log.info(u'標題為<<%s>>的項輸入到"%s成功"'
                              % (item.title, fileName))

    def getResponseContent(self, url):
        """Return the body of ``url`` as text, or ``None`` if the request fails."""
        try:
            # A timeout keeps one stalled connection from hanging the crawl.
            response = requests.get(url, timeout=10)
        except requests.RequestException:
            self.log.error(u'Python 返回 URL:%s 資料失敗 ' % url)
            return None
        else:
            self.log.info(u'Python 返回 URL:%s 資料成功 ' % url)
            return response.text
# Script entry point: crawl the listing pages of the forum named in the URL.
if __name__ == '__main__':
    # The trailing pn value is only a placeholder; it is replaced per page.
    url = u'http://tieba.baidu.com/f?kw=權力的遊戲&ie=utf-8&pn=50'
    GTI = GetTiebaInfo(url)

複製代碼

mylog.py：

import getpass
import logging
import os
import sys
class MyLog(object):
    """Small wrapper around a ``logging`` logger that logs to a file and the console.

    The logger is named after the current user. The log file name is derived
    from ``sys.argv[0]`` with its extension replaced by ``.log``.
    """

    # Handlers already attached, keyed by logger name. ``logging.getLogger``
    # returns the same logger for the same name, so attaching new handlers on
    # every instantiation would duplicate each line and leak open files.
    _handlers = {}

    def __init__(self):
        """Configure the per-user logger, attaching its handlers only once."""
        self.user = getpass.getuser()
        self.logger = logging.getLogger(self.user)
        self.logger.setLevel(logging.DEBUG)
        # Log file name: script path without its extension. Fall back to a
        # fixed stem when argv[0] is empty (e.g. an interactive session).
        self.logFile = (os.path.splitext(sys.argv[0])[0] or 'mylog') + '.log'
        self.formatter = logging.Formatter('%(asctime)-12s %(levelname)-8s %(name)-10s %(message)-12s\r\n')
        # Show log records on screen and also write them to the log file.
        handlers = MyLog._handlers.get(self.user)
        if handlers is None:
            fileHandler = logging.FileHandler(self.logFile, encoding='utf8')
            streamHandler = logging.StreamHandler()
            for handler in (fileHandler, streamHandler):
                handler.setFormatter(self.formatter)
                handler.setLevel(logging.DEBUG)
                self.logger.addHandler(handler)
            handlers = (fileHandler, streamHandler)
            MyLog._handlers[self.user] = handlers
        self.logHand, self.logHandSt = handlers

    # The five log levels map to the five methods below.
    def debug(self, msg):
        """Log ``msg`` at DEBUG level."""
        self.logger.debug(msg)

    def info(self, msg):
        """Log ``msg`` at INFO level."""
        self.logger.info(msg)

    def warning(self, msg):
        """Log ``msg`` at WARNING level."""
        self.logger.warning(msg)

    def error(self, msg):
        """Log ``msg`` at ERROR level."""
        self.logger.error(msg)

    def critical(self, msg):
        """Log ``msg`` at CRITICAL level."""
        self.logger.critical(msg)
# Manual smoke test: emit one record at each level, including non-ASCII text.
if __name__ == '__main__':
    mylog = MyLog()
    mylog.debug(u"I'm debug 測試中文")
    mylog.info("I'm info")
    mylog.warning("I'm warning")
    mylog.error(u"I'm error 測試中文")
    mylog.critical("I'm critical")

複製代碼

參考文章
https://codeleading.com/article/55914188948/

賬號		自動登錄	找回密碼
密碼			註冊

[教學] Python 爬蟲模塊bs4 實戰一：獲取百度貼吧內容

相關帖子

瀏覽過的版塊