用Python编写的豆瓣读书爬虫,方便大家搜罗各种好书

源代码来自Lanbing。喜欢读书、乐于不断学习的朋友,可以用电脑访问他的“好书推荐”网站:http://sobook.lanbing510.info/。

import time
import requests
import numpy as np
from bs4 import BeautifulSoup
from openpyxl import Workbook
import pandas as pd


# Browser User-Agent strings; each one is wrapped into a ready-to-use
# ``headers`` dict below so callers can pick an entry from ``hds`` per request.
_USER_AGENTS = (
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6',
    'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36 Edg/86.0.622.63',
)
hds = [{'User-Agent': agent} for agent in _USER_AGENTS]
# Raw Cookie header copied from a browser session; the original author notes
# that requests to the site had to carry cookies.
# NOTE(review): this string appears to embed live session credentials (e.g.
# the dbcl2 / ck fields) -- consider loading it from the environment or a
# local config file instead of keeping it in source.
cookie='bid=Cvo6tScaDhc; douban-fav-remind=1; __gads=ID=08ab08e395239f91-22e836faa2c400d8:T=1604765497:S=ALNI_Mb0ja0RNpMqaeBcyC-7FJbwXMnxsw; viewed="20537674"; gr_user_id=27cbbe13-30ca-4e8d-9be2-3d90612d5566; __utmz=30149280.1605193881.2.2.utmcsr=sobook.lanbing510.info|utmccn=(referral)|utmcmd=referral|utmcct=/; _vwo_uuid_v2=DB03B1435CD1A1B65F6CF14A89F76A717|8c4878c74cf2c321f763295803bda082; ll="118282"; __utmc=30149280; dbcl2="149139516:TT7Vg/ii9y8"; ck=k6Zq; push_doumail_num=0; push_noty_num=0; __utmv=30149280.14913; __utma=30149280.2030410437.1604765498.1605428802.1605431529.4; __utmt_douban=1; __utmb=30149280.1.10.1605431529'
# Map cookie name -> value. Each "name=value" pair is split on its FIRST "="
# only, because several values (__gads, __utmz) contain "=" themselves and
# would otherwise be truncated to the text after their last "=".
cookie_dict = {
    name: value
    for name, _, value in (pair.partition("=") for pair in cookie.split("; "))
}

# "IP:port" entries found at https://proxy.coderbusy.com/ (keep refreshing
# that page to get more). One entry per line; split() turns the block into
# the list of strings.
pro = """
43.254.168.56:53281
178.128.125.239:44331
178.128.146.4:62225
101.132.143.232:80
128.199.202.122:3128
165.225.12.111:10605
165.225.222.101:10605
115.223.7.110:80
110.243.11.47:9999
114.99.12.213:3000
165.225.76.174:10605
165.225.200.104:10605
191.101.39.154:80
165.225.92.123:10605
""".split()

def _node_text(node, default):
    """Return the stripped text of a parsed HTML node, or *default* if it is None."""
    if node is None:
        return default
    return node.get_text().strip()


def _parse_book_item(book_info):
    """Convert one ``li.subject-item`` element into a record dict.

    Echoes the parsed fields to stdout as it goes. Returns None when the
    item has no ``<h2>`` heading, since title and URL both come from it.
    """
    heading = book_info.find('h2')
    if heading is None:
        return None
    title = heading.get_text().replace(' ', '').replace('\n', '').replace('\r', '')
    print(title)

    # The "pub" div holds slash-separated fields; the last three are treated
    # as publication info and everything before them as author/translator.
    desc = _node_text(book_info.find('div', class_='pub'), '')
    desc_list = desc.split('/')
    print(desc_list)

    link = heading.find('a')
    book_url = link.get('href') if link is not None else None
    print(book_url)

    subscript_info = _node_text(book_info.find('p'), '简介描述: 暂无')

    # Slicing never raises, so the fallbacks are chosen explicitly when the
    # corresponding part of the description is empty.
    author_parts = desc_list[:-3]
    if author_parts:
        author_info = '作者/译者: ' + '/'.join(author_parts)
    else:
        author_info = '作者/译者: 暂无'
    if desc:
        pub_info = '出版信息: ' + '/'.join(desc_list[-3:])
    else:
        pub_info = '出版信息: 暂无'

    rating = _node_text(book_info.find('span', class_='rating_nums'), '0.0')
    people_num = _node_text(book_info.find('span', class_='pl'), '0')
    print(title, rating, people_num, author_info, pub_info)

    return {
        'title': title,
        'book_url': book_url,
        # Store the parsed score here; the rating count gets its own key.
        'rating': rating,
        'people_num': people_num,
        'author_info': author_info,
        'pub_info': pub_info,
        'subscript_info': subscript_info,
    }


def book_spider(book_tag, max_pages=5):
    """Scrape the Douban Books listing pages of a single tag.

    Requests ``http://book.douban.com/tag/<book_tag>`` page by page (20 items
    per page), sleeping for a random interval around every request to avoid
    the site's anti-scraping measures.

    Args:
        book_tag: Tag name appended to the listing URL.
        max_pages: Number of listing pages to request (default 5).

    Returns:
        A list of dicts with the keys ``title``, ``book_url``, ``rating``,
        ``people_num``, ``author_info``, ``pub_info`` and ``subscript_info``.
        Pages that fail to download or return a non-200 status are skipped;
        scraping stops early at the first page without a ``subject-list``.
    """
    book_list = []

    for page_num in range(max_pages):
        # Each listing page shows 20 items.
        url = 'http://book.douban.com/tag/' + book_tag + '?start=' + str(page_num * 20) + '&type=T'
        print('*****************************************************************************')
        print(url)
        waittime = np.random.rand() * 21
        print(waittime)
        time.sleep(waittime + 10)  # random pause to avoid anti-scraping measures
        try:
            req = requests.get(
                url,
                headers=hds[page_num % len(hds)],  # rotate User-Agents per page
                cookies=cookie_dict,
                timeout=30,  # never hang forever on a stalled connection
            )
        except requests.RequestException as e:
            print(e)
            continue
        print(req.status_code)
        time.sleep(np.random.rand() * 19)  # random pause to avoid anti-scraping measures
        if req.status_code != 200:
            print('Skipping page %d: HTTP status %d' % (page_num, req.status_code))
            continue
        print(req.status_code, '开始爬取数据')

        soup = BeautifulSoup(req.text, features="lxml")
        list_soup = soup.find('ul', class_='subject-list')
        if list_soup is None:
            # No result list on this page (end of results or a blocked
            # request); later pages cannot be parsed either.
            print('No subject list found on page %d, stopping' % page_num)
            break

        for book_info in list_soup.find_all('li', class_='subject-item'):
            record = _parse_book_item(book_info)
            if record is not None:
                book_list.append(record)
        print('Downloading Information From Page %d' % page_num)
    return book_list


def get_people_num(url, max_retries=3):
    """Fetch *url* and return the text of the second ``<span>`` in ``div.rating_sum``.

    Each attempt uses a randomly chosen User-Agent from ``hds`` together with
    ``cookie_dict``. Attempts that raise or return a non-200 status are
    retried after a short random pause, up to *max_retries* times.

    Args:
        url: Page URL to download.
        max_retries: Maximum number of download attempts (default 3).

    Returns:
        The stripped span text, or ``'0'`` when the page could not be
        downloaded or does not contain the expected element.
    """
    plain_text = None
    for attempt in range(max_retries):
        try:
            req = requests.get(
                url,
                headers=hds[np.random.randint(0, len(hds))],
                cookies=cookie_dict,
                timeout=30,
            )
        except requests.RequestException as e:
            print(e)
        else:
            if req.status_code == 200:
                plain_text = req.text
                break
        if attempt < max_retries - 1:
            # Back off briefly instead of hammering the server in a tight loop.
            time.sleep(np.random.rand() * 5)

    if plain_text is None:
        return '0'

    soup = BeautifulSoup(plain_text, features="lxml")
    rating_sum = soup.find('div', {'class': 'rating_sum'})
    spans = rating_sum.find_all('span') if rating_sum is not None else []
    if len(spans) < 2 or spans[1].string is None:
        return '0'
    return spans[1].string.strip()


def do_spider(book_tag_lists):
    """Run ``book_spider`` on every tag, in order.

    Args:
        book_tag_lists: Iterable of tag names.

    Returns:
        A list with one ``book_spider`` result per tag, in input order.
    """
    return [book_spider(tag) for tag in book_tag_lists]


def print_book_lists_excel(book_lists, book_tag, index=0):
    """Write one tag's scraped records to ``douban-<book_tag>.xlsx``.

    Args:
        book_lists: List of per-tag record lists, as returned by ``do_spider``.
        book_tag: Tag name; used for both the file name and the sheet name.
        index: Position in *book_lists* of the records to write (default 0).
            Passed explicitly so the function does not depend on a
            module-level loop variable.
    """
    pf = pd.DataFrame(book_lists[index])
    # Replace empty cells.
    pf.fillna(' ', inplace=True)
    # The context manager saves and closes the workbook on exit, so no
    # explicit ExcelWriter.save() call is needed.
    with pd.ExcelWriter('douban-' + book_tag + '.xlsx') as writer:
        pf.to_excel(writer, index=False, sheet_name=book_tag)
    print("保存文件成功,处理结束")

if __name__=='__main__':
    book_tag_lists = [ '心理','判断与决策','算法','数据结构','经济','历史']
    # Alternative tag sets (Douban tag names, kept verbatim); uncomment one to use it.
    #book_tag_lists = ['传记','哲学','编程','创业','理财','社会学','佛教']
    #book_tag_lists = ['思想','科技','科学','web','股票','爱情','两性']
    #book_tag_lists = ['计算机','机器学习','linux','android','数据库','互联网']
    #book_tag_lists = ['数学']
    #book_tag_lists = ['摄影','设计','音乐','旅行','教育','成长','情感','育儿','健康','养生']
    #book_tag_lists = ['商业','理财','管理']  
    #book_tag_lists = ['名著']
    #book_tag_lists = ['科普','经典','生活','心灵','文学']
    #book_tag_lists = ['科幻','思维','金融']
    #book_tag_lists = ['个人管理','时间管理','投资','文化','宗教']
    # book_tag_lists = ['时间管理','投资','文化','宗教','科幻','思维','金融','科普','经典','生活','心灵','文学','数学','传记','哲学','编程','创业','理财','社会学','佛教','心理','判断与决策','算法','数据结构','经济','历史']  # single-item test
    # Crawl each tag exactly once and save it straight away, so earlier
    # results are already on disk if a later tag fails.
    for book_tag in book_tag_lists:
        book_lists = do_spider([book_tag])
        print_book_lists_excel(book_lists, book_tag)

您可以选择一种方式赞助本站

支付宝转账赞助

支付宝扫一扫赞助

微信钱包扫描赞助

目前评论:1   其中:访客  0   博主  0

  1. avatar IMJMJ

    不错,我喜欢,收藏了、、、

评论加载中...

发表评论

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

图片 表情