|
getCommentInfo.py:
- import requests
- from bs4 import BeautifulSoup
- from mylog import MyLog as mylog
class Item(object):
    """Plain record holding the fields scraped for one forum thread.

    Every field defaults to ``None`` at class level; ``GetTiebaInfo.spider``
    assigns the real values on each instance.  The "frist" spelling is kept
    on purpose because other code in this file reads and writes these exact
    attribute names.
    """

    title = None        # text of the thread's title link
    fristAuthor = None  # name of the author who opened the thread
    fristTime = None    # creation time as shown on the listing page
    reNum = None        # reply count as shown on the listing page
    content = None      # one-line abstract of the thread
    lastAuthor = None   # name of the most recent replier
    lastTime = None     # time of the most recent reply
class GetTiebaInfo(object):
    """Scrape thread summaries from forum listing pages and save them to a text file.

    Constructing an instance runs the whole job: it derives the listing-page
    URLs from ``url``, downloads and parses each page, and writes every
    parsed thread to a UTF-8 text file in the current directory.
    """

    # Seconds to wait for each HTTP response; without a timeout a stalled
    # server would block the run forever.
    requestTimeout = 10

    def __init__(self, url, pageSum=5):
        """Run the scrape.

        Args:
            url: Listing URL whose text after the last ``=`` is the paging
                offset (it is replaced for every generated page).
            pageSum: Number of listing pages to fetch.
        """
        self.url = url
        self.log = mylog()
        self.pageSum = pageSum
        self.urls = self.getUrls(self.pageSum)
        self.items = self.spider(self.urls)
        self.pipelines(self.items)

    def getUrls(self, pageSum):
        """Return ``pageSum`` page URLs with offsets 0, 50, 100, ...

        The offset replaces whatever follows the last ``=`` in ``self.url``.
        """
        head, sep, _ = self.url.rpartition('=')
        urls = [head + sep + str(page * 50) for page in range(pageSum)]
        self.log.info(u'取得URLS成功')
        return urls

    def spider(self, urls):
        """Download every URL and return the list of parsed ``Item`` objects.

        Pages that could not be downloaded are skipped; the failure has
        already been logged by ``getResponseContent``.
        """
        items = []
        for url in urls:
            htmlContent = self.getResponseContent(url)
            if htmlContent is None:
                continue
            soup = BeautifulSoup(htmlContent, 'lxml')
            tagsli = soup.find_all('li', attrs={'class': 'j_thread_list clearfix'})
            for tag in tagsli:
                item = self._parseItem(tag)
                items.append(item)
                self.log.info(u'取得標題為<<%s>>的項成功 ...' % item.title)
        return items

    @staticmethod
    def _textOf(node):
        """Return the stripped text of ``node``, or '' when the element is missing."""
        if node is None:
            return ''
        return node.get_text().strip()

    def _parseItem(self, tag):
        """Build one ``Item`` from a thread ``<li>``; missing elements yield ''."""
        # NOTE(review): the title-attribute strings below are Traditional
        # Chinese -- confirm they match what the site actually emits.
        # NOTE(review): the 'threadlist_abs_onlyline ' class value carries a
        # trailing space -- confirm it is intended.
        item = Item()
        item.title = self._textOf(tag.find('a', attrs={'class': 'j_th_tit'}))
        authorSpan = tag.find('span', attrs={'class': 'frs-author-name-warp'})
        item.fristAuthor = self._textOf(authorSpan.a if authorSpan is not None else None)
        item.fristTime = self._textOf(tag.find('span', attrs={'title': u'建立時間'}))
        item.reNum = self._textOf(tag.find('span', attrs={'title': u'回覆'}))
        item.content = self._textOf(tag.find('div', attrs={'class': 'threadlist_abs_onlyline '}))
        item.lastAuthor = self._textOf(tag.find('span', attrs={'class': 'tb_icon_author_rely j_replayer'}))
        item.lastTime = self._textOf(tag.find('span', attrs={'title': u'最後回覆時間'}))
        return item

    @staticmethod
    def _formatItem(item):
        """Return the text record written to the output file for ``item``."""
        # Fields are formatted as text; the file object does the UTF-8
        # encoding, so encoding here would write b'...' reprs instead.
        return ('title:%s \t author:%s \t firstTime:%s \n content:%s \n return:%s \n lastAuthor:%s \t lastTime:%s \n\n\n\n'
                % (item.title, item.fristAuthor, item.fristTime, item.content,
                   item.reNum, item.lastAuthor, item.lastTime))

    def pipelines(self, items):
        """Write every item to the output text file (UTF-8) and log each write."""
        fileName = u'擺渡貼吧_權力的遊戲.txt'
        with open(fileName, 'w', encoding='utf-8') as fp:
            for item in items:
                fp.write(self._formatItem(item))
                self.log.info(u'標題為<<%s>>的項輸入到"%s成功"'
                              % (item.title, fileName))

    def getResponseContent(self, url):
        """Return the response body text for ``url``, or ``None`` when the request fails."""
        try:
            response = requests.get(url, timeout=self.requestTimeout)
        except requests.RequestException:
            self.log.error(u'Python 返回 URL:%s 資料失敗 ' % url)
            return None
        self.log.info(u'Python 返回 URL:%s 資料成功 ' % url)
        return response.text
# Script entry point: constructing GetTiebaInfo performs the whole
# fetch / parse / save run as a side effect of __init__.
if __name__ == '__main__':
    url = u'http://tieba.baidu.com/f?kw=權力的遊戲&ie=utf-8&pn=50'
    GTI = GetTiebaInfo(url)
mylog.py:
- import logging
- import getpass
- import sys
class MyLog(object):
    """Small logging wrapper that writes every record to a log file and to the console.

    The underlying logger is named after the current user.  The log file
    sits next to the running script: the script path with its extension
    replaced by ``.log``.
    """

    # Handlers already attached, keyed by logger name.  logging.getLogger
    # returns the same logger for the same name, so attaching fresh handlers
    # on every MyLog() would emit each record once per instance created.
    _configured = {}

    def __init__(self):
        self.user = getpass.getuser()
        self.logger = logging.getLogger(self.user)
        self.logger.setLevel(logging.DEBUG)
        # Log file name
        self.logFile = self._logFileFor(sys.argv[0])
        self.formatter = logging.Formatter('%(asctime)-12s %(levelname)-8s %(name)-10s %(message)-12s\r\n')
        # Show records on the console and also write them to the log file
        handlers = MyLog._configured.get(self.user)
        if handlers is None:
            fileHandler = logging.FileHandler(self.logFile, encoding='utf8')
            streamHandler = logging.StreamHandler()
            for handler in (fileHandler, streamHandler):
                handler.setFormatter(self.formatter)
                handler.setLevel(logging.DEBUG)
                self.logger.addHandler(handler)
            handlers = (fileHandler, streamHandler)
            MyLog._configured[self.user] = handlers
        self.logHand, self.logHandSt = handlers

    @staticmethod
    def _logFileFor(scriptPath):
        """Return ``scriptPath`` with its extension (if any) replaced by ``.log``."""
        import os
        return os.path.splitext(scriptPath)[0] + '.log'

    # The five logging levels map onto the five methods below
    def debug(self, msg):
        """Log ``msg`` at DEBUG level."""
        self.logger.debug(msg)

    def info(self, msg):
        """Log ``msg`` at INFO level."""
        self.logger.info(msg)

    def warning(self, msg):
        """Log ``msg`` at WARNING level."""
        self.logger.warning(msg)

    def error(self, msg):
        """Log ``msg`` at ERROR level."""
        self.logger.error(msg)

    def critical(self, msg):
        """Log ``msg`` at CRITICAL level."""
        self.logger.critical(msg)
# Manual smoke test: emit one record at each level, two of them with
# non-ASCII text to exercise the UTF-8 file handler.
if __name__ == '__main__':
    mylog = MyLog()
    mylog.debug(u"I'm debug 測試中文")
    mylog.info("I'm info")
    mylog.warning("I'm warning")
    mylog.error(u"I'm error 測試中文")
    mylog.critical("I'm critical")
參考文章:
https://codeleading.com/article/55914188948/
|
|