Python爬取糗事百科的段子

  • A+
所属分类:爬虫专区

以下是源代码:

 源代码

这里是内容

# -*- coding:utf-8 -*-
import requests
import re
import json
class Redstar:
    """Scrape jokes from the qiushibaike.com "8hr" listing pages.

    Each page is downloaded with ``requests``, author/content pairs are
    extracted with a regular expression, and every pair is appended to
    ``qiushibaike.txt`` as one JSON object per line.
    """

    # Captures (author name, joke body): the text of an <h2> element and the
    # text of the first <span> that follows a tag whose attribute value ends
    # in `content"`.  re.S lets `.` cross line breaks in the markup.
    _ITEM_PATTERN = re.compile(
        r'<h2>(.*?)</h2>.*?<.*?content">.*?<span>(.*?)</span>', re.S)

    def __init__(self):
        """Create a scraper; no state is needed until a page is fetched."""
        pass

    def getdata(self, url, timeout=10):
        """Download *url* and return the response body as text.

        The response object is also kept on ``self.html``.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds to wait for the server before ``requests``
                raises a timeout error, so a stalled connection cannot
                block the scraper indefinitely.

        Returns:
            The decoded body of the HTTP response.
        """
        self.html = requests.get(url, timeout=timeout)
        return self.html.text

    def parse_page(self, html):
        """Yield one ``{'name': ..., 'content': ...}`` dict per joke in *html*.

        Args:
            html: Markup of a listing page.

        Yields:
            Dicts whose ``name`` is the <h2> text with surrounding newlines
            stripped and whose ``content`` is the <span> text with every
            ``<br/>`` tag and newline removed.
        """
        for name, content in self._ITEM_PATTERN.findall(html):
            yield {
                'name': name.strip('\n'),
                # Removing every newline also covers the leading/trailing
                # ones, so no separate strip is needed here.
                'content': content.replace('<br/>', '').replace('\n', ''),
            }

    def writer(self, content):
        """Append *content* to ``qiushibaike.txt`` as a single JSON line.

        Args:
            content: JSON-serialisable object (a dict from ``parse_page``).
        """
        # The with-statement closes the file; ensure_ascii=False keeps
        # non-ASCII text readable in the output instead of \uXXXX escapes.
        with open('qiushibaike.txt', 'a', encoding='utf-8') as f:
            f.write(json.dumps(content, ensure_ascii=False) + '\n')

    def main(self, num):
        """Fetch listing page *num*, then save and print every joke on it.

        Args:
            num: Page number appended to the listing URL.
        """
        url = "https://www.qiushibaike.com/8hr/page/" + str(num)
        html = self.getdata(url)
        # Call through self: going through the class name only works when
        # that name happens to be rebound to an instance by the caller.
        for item in self.parse_page(html):
            self.writer(item)
            print(item)

if __name__ == '__main__':
    # NOTE: this rebinds the module-level name `Redstar` from the class to an
    # instance; every later reference to `Redstar` in this module resolves to
    # that instance.
    Redstar = Redstar()
    # Scrape listing pages 0 through 12, one after another.
    for page_number in range(13):
        Redstar.main(page_number)
  • 我的微信
  • 这是我的微信扫一扫
  • weixin
  • 我的微信公众号
  • 我的微信公众号扫一扫
  • weixin

发表评论

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen: