The source code comes from Lanbing. Friends who enjoy reading and continuous learning can visit his book-recommendation site (best viewed on a desktop browser): http://sobook.lanbing510.info/.
import time

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Some User Agents, rotated between requests
hds = [
    {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'},
    {'User-Agent': 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11'},
    {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)'},
    {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36 Edg/86.0.622.63'},
]

# Requesting the pages turned out to require a cookie
cookie = 'bid=Cvo6tScaDhc; douban-fav-remind=1; __gads=ID=08ab08e395239f91-22e836faa2c400d8:T=1604765497:S=ALNI_Mb0ja0RNpMqaeBcyC-7FJbwXMnxsw; viewed="20537674"; gr_user_id=27cbbe13-30ca-4e8d-9be2-3d90612d5566; __utmz=30149280.1605193881.2.2.utmcsr=sobook.lanbing510.info|utmccn=(referral)|utmcmd=referral|utmcct=/; _vwo_uuid_v2=DB03B1435CD1A1B65F6CF14A89F76A717|8c4878c74cf2c321f763295803bda082; ll="118282"; __utmc=30149280; dbcl2="149139516:TT7Vg/ii9y8"; ck=k6Zq; push_doumail_num=0; push_noty_num=0; __utmv=30149280.14913; __utma=30149280.2030410437.1604765498.1605428802.1605431529.4; __utmt_douban=1; __utmb=30149280.1.10.1605431529'
# split on the first '=' only, since cookie values may themselves contain '='
cookie_dict = {i.split("=", 1)[0]: i.split("=", 1)[1] for i in cookie.split("; ")}

# Proxy IPs found at https://proxy.coderbusy.com/ (keep refreshing the page for fresh ones);
# declared here but not wired into the requests below
pro = ['43.254.168.56:53281', '178.128.125.239:44331', '178.128.146.4:62225',
      '101.132.143.232:80', '128.199.202.122:3128', '165.225.12.111:10605',
      '165.225.222.101:10605', '115.223.7.110:80', '110.243.11.47:9999',
      '114.99.12.213:3000', '165.225.76.174:10605', '165.225.200.104:10605',
      '191.101.39.154:80', '165.225.92.123:10605']


def book_spider(book_tag):
    book_list = []
    for page_num in range(0, 5):
        # 20 items per listing page
        url = 'http://book.douban.com/tag/' + book_tag + '?start=' + str(page_num * 20) + '&type=T'
        print('*****************************************************************************')
        print(url)
        waittime = np.random.rand() * 21
        print(waittime)
        time.sleep(waittime + 10)  # random pause to avoid anti-scraping measures
        req = requests.get(url, headers=hds[page_num % len(hds)], cookies=cookie_dict)
        print(req.status_code)
        time.sleep(np.random.rand() * 19)  # second random pause, same reason
        print(req.status_code, '开始爬取数据')
        plain_text = req.text
        soup = BeautifulSoup(plain_text, features="lxml")
        list_soup = soup.find('ul', class_='subject-list')
        books = list_soup.find_all('li', class_='subject-item')
        for book_info in books:
            title = book_info.find('h2').get_text().replace(' ', '').replace('\n', '').replace('\r', '')
            print(title)
            desc = book_info.find('div', class_='pub').get_text().strip()
            desc_list = desc.split('/')
            print(desc_list)
            book_url = book_info.find('h2').find('a').get('href')
            print(book_url)
            try:
                subscript_info = book_info.find('p').get_text().strip()
            except Exception:
                subscript_info = '简介描述: 暂无'
            try:
                author_info = '作者/译者: ' + '/'.join(desc_list[0:-3])
            except Exception:
                author_info = '作者/译者: 暂无'
            try:
                pub_info = '出版信息: ' + '/'.join(desc_list[-3:])
            except Exception:
                pub_info = '出版信息: 暂无'
            try:
                rating = book_info.find('span', class_='rating_nums').get_text().strip()
            except Exception:
                rating = '0.0'
            try:
                people_num = book_info.find('span', class_='pl').get_text().strip()
            except Exception:
                people_num = '0'
            print(title, rating, people_num, author_info, pub_info)
            book_list.append({'title': title, 'book_url': book_url, 'rating': rating,
                              'people_num': people_num, 'author_info': author_info,
                              'pub_info': pub_info, 'subscript_info': subscript_info})
        print('Downloading Information From Page %d' % page_num)
    return book_list


def get_people_num(url):
    # fetch a single book page and read the rating count from it
    plain_text = ''
    try:
        req = requests.get(url, headers=hds[np.random.randint(0, len(hds))], cookies=cookie_dict)
        while req.status_code != 200:
            req = requests.get(url, headers=hds[np.random.randint(0, len(hds))], cookies=cookie_dict)
        plain_text = req.text
    except Exception as e:
        print(e)
    soup = BeautifulSoup(plain_text, features="lxml")
    people_num = soup.find('div', {'class': 'rating_sum'}).findAll('span')[1].string.strip()
    return people_num


def do_spider(book_tag_lists):
    book_lists = []
    for book_tag in book_tag_lists:
        book_list = book_spider(book_tag)
        book_lists.append(book_list)
    return book_lists


def print_book_lists_excel(book_list, book_tag):
    # write one xlsx file per tag, with the tag as the sheet name
    with pd.ExcelWriter('douban-' + book_tag + '.xlsx') as writer:
        pf = pd.DataFrame(book_list)
        pf.fillna(' ', inplace=True)  # replace empty cells
        pf.to_excel(writer, index=False, sheet_name=book_tag)
    print("保存文件成功,处理结束")


if __name__ == '__main__':
    book_tag_lists = ['心理', '判断与决策', '算法', '数据结构', '经济', '历史']
    # book_tag_lists = ['传记','哲学','编程','创业','理财','社会学','佛教']
    # book_tag_lists = ['思想','科技','科学','web','股票','爱情','两性']
    # book_tag_lists = ['计算机','机器学习','linux','android','数据库','互联网']
    # book_tag_lists = ['数学']  # single-item test
    # book_tag_lists = ['摄影','设计','音乐','旅行','教育','成长','情感','育儿','健康','养生']
    # book_tag_lists = ['商业','理财','管理']
    # book_tag_lists = ['名著']
    # book_tag_lists = ['科普','经典','生活','心灵','文学']
    # book_tag_lists = ['科幻','思维','金融']
    # book_tag_lists = ['个人管理','时间管理','投资','文化','宗教']
    # book_tag_lists = ['时间管理','投资','文化','宗教','科幻','思维','金融','科普','经典','生活','心灵','文学','数学','传记','哲学','编程','创业','理财','社会学','佛教','心理','判断与决策','算法','数据结构','经济','历史']
    book_lists = do_spider(book_tag_lists)  # scrape every tag once
    for i in range(0, len(book_tag_lists)):
        print_book_lists_excel(book_lists[i], book_tag_lists[i])
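Note that the `pro` proxy list is declared but never actually passed to `requests`. If Douban starts blocking the client IP despite the random pauses, one option is to route each request through a randomly chosen proxy via the `proxies` parameter of `requests.get`. Below is a minimal sketch; `get_with_proxy` is a hypothetical helper not in the original script, and it assumes the addresses in `pro` are live HTTP proxies (free proxies expire quickly, so the list would need refreshing first):

import numpy as np
import requests

def get_with_proxy(url, headers, cookies, proxy_pool, retries=3):
    # Hypothetical helper: try up to `retries` randomly chosen proxies
    # from `proxy_pool` (a list of 'ip:port' strings such as `pro`),
    # then fall back to a direct request if all of them fail.
    for _ in range(retries):
        proxy = proxy_pool[np.random.randint(0, len(proxy_pool))]
        try:
            return requests.get(url, headers=headers, cookies=cookies,
                                proxies={'http': 'http://' + proxy,
                                         'https': 'http://' + proxy},
                                timeout=10)
        except requests.RequestException as e:
            print('proxy %s failed: %s' % (proxy, e))
    return requests.get(url, headers=headers, cookies=cookies, timeout=10)

With this in place, the fetch inside book_spider could become req = get_with_proxy(url, hds[page_num % len(hds)], cookie_dict, pro). The same helper could also be dropped into get_people_num, which is defined above but never called from the main flow.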