To help a friend list a partner's products on his e-commerce site, I needed the product images, but the partner hadn't organized or packaged them, and saving every picture from their website by hand would obviously be slow and tedious. So I dashed off a small crawler; in the time it takes to drink a cup of bubble tea, it grabbed every image and filed each product into its own folder.
- First, get the required libraries ready; the key function here is urlretrieve.
```python
# -*- coding: utf-8 -*-
"""
Created on 2021-9-5
Scrape product information and images from http://ruocin.com/
"""
import requests
# BeautifulSoup parses the HTML pages
from bs4 import BeautifulSoup
# urlretrieve downloads the images
from urllib.request import urlretrieve
import chardet
import socket
import os

"""
Image (file) download: the core tool is the urlretrieve() method
of the urllib.request module.

urlretrieve(url, filename=None, reporthook=None, data=None)
    url:        the file's URL
    filename:   local file (path) name to save under
    reporthook: callback invoked during the transfer
    data:       data to POST to the server

It returns a 2-tuple ("local file path", <http.client.HTTPMessage object>).
"""
```
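As a quick illustration of that return value, a minimal sketch (the image URL and file name are placeholders for the demo, not real paths on the site):

```python
from urllib.request import urlretrieve

# A placeholder image URL; urlretrieve returns (local_path, HTTPMessage)
local_path, headers = urlretrieve('http://ruocin.com/en/images/demo.jpg', 'demo.jpg')
print(local_path)                   # 'demo.jpg', the name we passed in
print(headers.get_content_type())   # e.g. 'image/jpeg'
```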
```python
Targeturl = 'http://ruocin.com/en/product.asp'
neirong = requests.get(Targeturl)
print(neirong.encoding)
# A simple reachability check before we start crawling
if neirong.status_code == 200:
    print('Target website page connected successfully!')
else:
    print('Cannot connect to this target website page, please check')
```
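As an aside, requests can also fail loudly instead of just printing a warning; a sketch using raise_for_status():

```python
resp = requests.get(Targeturl)
# raise_for_status() raises requests.HTTPError on 4xx/5xx responses,
# so a failed fetch stops the script instead of slipping through
resp.raise_for_status()
print('Target website page connected successfully!')
```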
A download-progress callback; you can hand it to urlretrieve as the reporthook argument (a sketch follows the function):
```python
def callbackfunc(blocknum, blocksize, totalsize):
    '''Report download progress.
    @blocknum:  number of blocks downloaded so far
    @blocksize: size of each data block
    @totalsize: size of the remote file
    '''
    percent = 100.0 * blocknum * blocksize / totalsize
    if percent > 100:
        percent = 100
    print("%.2f%%" % percent)
```
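Note that the script below never actually wires this in; to see the progress output, pass it like this (the image URL is again a placeholder):

```python
# reporthook is invoked as callbackfunc(block_number, block_size, total_size)
urlretrieve('http://ruocin.com/en/images/demo.jpg', 'demo.jpg',
            reporthook=callbackfunc)
```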
A directory helper; this one is very handy: it skips the folder if it already exists and creates it if it doesn't.
```python
def mkdir(path):
    # strip leading/trailing whitespace
    path = path.strip()
    # strip a trailing backslash
    path = path.rstrip("\\")
    # does the path already exist?
    isExists = os.path.exists(path)
    if not isExists:
        # it doesn't: create the directory (including parents)
        os.makedirs(path)
        print(path + ' created successfully')
        return True
    else:
        # it does: don't create it, just report that it is there
        print(path + ' directory already exists')
        return False
```
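For reference, the standard library can do the create-if-missing dance in a single call; a minimal equivalent sketch (mkdir_simple is a hypothetical name, not part of the script):

```python
import os

def mkdir_simple(path):
    # exist_ok=True makes makedirs a no-op when the directory already exists
    os.makedirs(path.strip().rstrip("\\"), exist_ok=True)
```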
The main download function for a product's detail page:
```python
def productinfodownload(productname, producturl, mkpath):
    mkdir(mkpath)
    productlistcontent = requests.get(producturl).text
    productlissoup = BeautifulSoup(productlistcontent, "html.parser")
    # Left side: the product picture gallery
    productlistimgs = productlissoup.find('div', class_="product_pics_lt left")
    productimgs = productlistimgs.find_all('img')
    Proimgseq = 0
    for proimg in productimgs:
        proimgsrc = 'http://ruocin.com/en/' + proimg.get('src')
        Proimgseq = Proimgseq + 1
        print(Proimgseq, proimgsrc)
        # local file name for the downloaded image
        productimgfilename = mkpath + productname + '-' + str(Proimgseq) + '.' + proimgsrc.split('.')[-1]
        # 30-second timeout so a stalled transfer can't hang forever
        socket.setdefaulttimeout(30)
        # retry on timeout, to avoid incomplete downloads without looping forever
        try:
            urlretrieve(proimgsrc, productimgfilename)
        except socket.timeout:
            count = 1
            while count <= 5:
                try:
                    urlretrieve(proimgsrc, productimgfilename)
                    break
                except socket.timeout:
                    err_info = 'Reloading for %d time' % count if count == 1 else 'Reloading for %d times' % count
                    print(err_info)
                    count += 1
            if count > 5:
                print("downloading picture failed!")
        print('$' * 18)
    # Right side: the specification text
    productcontenrt = productlissoup.find('div', class_="product_pics_rt right").text
    print(productcontenrt)
    productcontenrtfile = mkpath + productname + '-spec.txt'
    with open(productcontenrtfile, 'w', encoding='utf-8') as f:
        f.write(productcontenrt)
    # Bottom: the description text
    productcontendown = productlissoup.find('div', class_="product_desp").text
    print(productcontendown)
    productcontenrtfile = mkpath + productname + '-description.txt'
    with open(productcontenrtfile, 'w', encoding='utf-8') as f:
        f.write(productcontendown)
    # Bottom: the detail images
    productcontendownimg = productlissoup.find('div', class_="product_desp")
    productdownimgs = productcontendownimg.find_all('img')
    Proimgseqdown = 0
    for proimg in productdownimgs:
        proimgsrc = 'http://ruocin.com' + proimg.get('src')
        Proimgseqdown = Proimgseqdown + 1
        print(Proimgseqdown, proimgsrc)
        productimgfilename = mkpath + productname + '-detail-' + str(Proimgseqdown) + '.' + proimgsrc.split('.')[-1]
        socket.setdefaulttimeout(30)
        try:
            urlretrieve(proimgsrc, productimgfilename)
        except socket.timeout:
            count = 1
            while count <= 5:
                try:
                    urlretrieve(proimgsrc, productimgfilename)
                    break
                except socket.timeout:
                    err_info = 'Reloading for %d time' % count if count == 1 else 'Reloading for %d times' % count
                    print(err_info)
                    count += 1
            if count > 5:
                print("downloading picture failed!")
        print('#' * 28)
```
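The same try/while retry block appears verbatim for every image; if you wanted to factor it out, a helper along these lines would work (download_with_retry is a name I'm making up here, not part of the script):

```python
def download_with_retry(url, filename, retries=5):
    """Fetch url into filename, retrying on socket timeouts."""
    socket.setdefaulttimeout(30)
    for attempt in range(1, retries + 1):
        try:
            urlretrieve(url, filename)
            return True
        except socket.timeout:
            print('Reloading for %d time(s)' % attempt)
    print('downloading picture failed!')
    return False
```

Each image download in the loops above and below would then collapse to a single download_with_retry(proimgsrc, productimgfilename) call.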
There isn't much to crawl, so just put the list-page addresses in an array:
```python
Ruocinpage = [
    'http://ruocin.com/en/product.asp',
    'http://ruocin.com/en/product.asp?Page=2',
    'http://ruocin.com/en/product.asp?Page=3',
    'http://ruocin.com/en/product.asp?Page=4'
]
```
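Since the pages follow a fixed ?Page=N pattern, the same list could be generated rather than typed out; a small sketch:

```python
base = 'http://ruocin.com/en/product.asp'
# page 1 has no query string; pages 2-4 use ?Page=N
Ruocinpage = [base] + ['%s?Page=%d' % (base, n) for n in range(2, 5)]
```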
Everything is ready; the main loop below kicks off the crawl.
```python
for pageNum in range(0, 4):
    pageurl = Ruocinpage[pageNum]
    print('Now crawling product list page', pageNum, pageurl)
    productlistcontent = requests.get(pageurl).text
    productlissoup = BeautifulSoup(productlistcontent, "html.parser")
    productlistdiv = productlissoup.find('div', class_="pro_list")
    products = productlistdiv.find_all('li')
    print('Products on this page:', len(products))
    for product in products:
        productname = product.find('span').text
        print(productname)
        producturl = 'http://ruocin.com/en/' + product.find('span').find('a').get('href')
        print(producturl)
        productimg = 'http://ruocin.com/en/' + product.find('img').get('src')
        print(productimg)
        # a "/" in a product name would be read as a path separator, so replace it
        productname = productname.replace("/", "-")
        # directory to create for this product
        mkpath = productname + "/"
        mkdir(mkpath)
        # local file name for the list thumbnail
        productimgfilename = mkpath + productname + '.' + productimg.split('.')[-1]
        # 30-second timeout, retrying on timeout to avoid incomplete downloads
        socket.setdefaulttimeout(30)
        try:
            urlretrieve(productimg, productimgfilename)
        except socket.timeout:
            count = 1
            while count <= 5:
                try:
                    urlretrieve(productimg, productimgfilename)
                    break
                except socket.timeout:
                    err_info = 'Reloading for %d time' % count if count == 1 else 'Reloading for %d times' % count
                    print(err_info)
                    count += 1
            if count > 5:
                print("downloading picture failed!")
        print('$' * 18)
        print('Crawling the product detail page')
        productinfodownload(productname, producturl, mkpath)
        print('On to the next product')
    print('On to the next product list page')
```
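One fragile spot worth noting: absolute URLs are built by plain string concatenation, which breaks if an href is already absolute or starts with a slash. urllib.parse.urljoin handles both cases; a sketch (producturl_href stands in for the raw href pulled from the &lt;a&gt; tag):

```python
from urllib.parse import urljoin

# Resolves relative hrefs against the page URL and leaves
# already-absolute URLs untouched.
producturl = urljoin('http://ruocin.com/en/product.asp', producturl_href)
```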
Done in no time. My friend was delighted and added another bubble tea to the tab, so next I scraped the Chinese-language content as well.
```python
# Again, a list to hold the page addresses
RuocinCNpage = [
    'http://ruocin.com/product.asp',
    'http://ruocin.com/product.asp?Page=2',
    'http://ruocin.com/product.asp?Page=3',
    'http://ruocin.com/product.asp?Page=4'
]

# Same as the function above, with small tweaks to match the Chinese pages
def productCNinfodownload(productname, producturl, mkpath):
    mkdir(mkpath)
    productlistcontent = requests.get(producturl)
    # requests already un-gzips/deflates the transfer encoding; .content gives
    # the raw bytes, and chardet guesses the character set so we can decode them
    after_gzip = productlistcontent.content
    detected = chardet.detect(after_gzip)
    single_html = after_gzip.decode(detected['encoding'] or 'utf-8')
    productlissoup = BeautifulSoup(single_html, "html.parser")
    # Left side: the product picture gallery
    productlistimgs = productlissoup.find('div', class_="product_pics_lt left")
    productimgs = productlistimgs.find_all('img')
    Proimgseq = 0
    for proimg in productimgs:
        proimgsrc = 'http://ruocin.com/' + proimg.get('src')
        Proimgseq = Proimgseq + 1
        print(Proimgseq, proimgsrc)
        productimgfilename = mkpath + productname + '-' + str(Proimgseq) + '.' + proimgsrc.split('.')[-1]
        # 30-second timeout, retrying on timeout to avoid incomplete downloads
        socket.setdefaulttimeout(30)
        try:
            urlretrieve(proimgsrc, productimgfilename)
        except socket.timeout:
            count = 1
            while count <= 5:
                try:
                    urlretrieve(proimgsrc, productimgfilename)
                    break
                except socket.timeout:
                    err_info = 'Reloading for %d time' % count if count == 1 else 'Reloading for %d times' % count
                    print(err_info)
                    count += 1
            if count > 5:
                print("downloading picture failed!")
        print('$' * 18)
    # Right side: the specification text
    productcontenrt = productlissoup.find('div', class_="product_pics_rt right").text
    print(productcontenrt)
    productcontenrtfile = mkpath + productname + '-spec.txt'
    with open(productcontenrtfile, 'w', encoding='utf-8') as f:
        f.write(productcontenrt)
    # Bottom: the description text
    productcontendown = productlissoup.find('div', class_="product_desp").text
    print(productcontendown)
    productcontenrtfile = mkpath + productname + '-description.txt'
    with open(productcontenrtfile, 'w', encoding='utf-8') as f:
        f.write(productcontendown)
    # Bottom: the detail images
    productcontendownimg = productlissoup.find('div', class_="product_desp")
    productdownimgs = productcontendownimg.find_all('img')
    Proimgseqdown = 0
    for proimg in productdownimgs:
        proimgsrc = 'http://ruocin.com' + proimg.get('src')
        Proimgseqdown = Proimgseqdown + 1
        print(Proimgseqdown, proimgsrc)
        productimgfilename = mkpath + productname + '-detail-' + str(Proimgseqdown) + '.' + proimgsrc.split('.')[-1]
        socket.setdefaulttimeout(30)
        try:
            urlretrieve(proimgsrc, productimgfilename)
        except socket.timeout:
            count = 1
            while count <= 5:
                try:
                    urlretrieve(proimgsrc, productimgfilename)
                    break
                except socket.timeout:
                    err_info = 'Reloading for %d time' % count if count == 1 else 'Reloading for %d times' % count
                    print(err_info)
                    count += 1
            if count > 5:
                print("downloading picture failed!")
        print('#' * 28)
```
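chardet.detect doesn't decode anything itself; it returns a dict whose 'encoding' field we feed into bytes.decode. A minimal illustration (the sample string is made up for the demo):

```python
import chardet

# Simulated bytes from a GB2312-encoded page; detection is statistical,
# so longer samples give more reliable guesses
raw = '产品规格:不锈钢真空保温杯,容量500毫升,适合日常使用。'.encode('gb2312')
result = chardet.detect(raw)
print(result)   # e.g. {'encoding': 'GB2312', 'confidence': 0.99, 'language': 'Chinese'}
print(raw.decode(result['encoding']))
```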
Chinese web pages come with all sorts of messy character encodings, which is exactly what the simple, easy-to-use chardet library is for:
```python
import chardet  # chardet auto-detects character encodings, handy for sloppy pages

for pageNum in range(0, 4):
    pageurl = RuocinCNpage[pageNum]
    print('Now crawling product list page', pageNum, pageurl)
    productlistcontent = requests.get(pageurl)
    # .content gives the raw bytes; let chardet guess the encoding before decoding
    after_gzip = productlistcontent.content
    detected = chardet.detect(after_gzip)
    single_html = after_gzip.decode(detected['encoding'] or 'utf-8')
    productlissoup = BeautifulSoup(single_html, "html.parser")
    productlistdiv = productlissoup.find('div', class_="pro_list")
    products = productlistdiv.find_all('li')
    print('Products on this page:', len(products))
    for product in products:
        productname = product.find('span').text
        print(productname)
        producturl = 'http://ruocin.com/' + product.find('span').find('a').get('href')
        print(producturl)
        productimg = 'http://ruocin.com/' + product.find('img').get('src')
        print(productimg)
        # a "/" in a product name would be read as a path separator, so replace it
        productname = productname.replace("/", "-")
        # directory to create for this product
        mkpath = "C:/Users/Administrator/Documents/RuocinCNpage/" + productname + "/"
        mkdir(mkpath)
        # local file name for the list thumbnail
        productimgfilename = mkpath + productname + '.' + productimg.split('.')[-1]
        # 30-second timeout, retrying on timeout to avoid incomplete downloads
        socket.setdefaulttimeout(30)
        try:
            urlretrieve(productimg, productimgfilename)
        except socket.timeout:
            count = 1
            while count <= 5:
                try:
                    urlretrieve(productimg, productimgfilename)
                    break
                except socket.timeout:
                    err_info = 'Reloading for %d time' % count if count == 1 else 'Reloading for %d times' % count
                    print(err_info)
                    count += 1
            if count > 5:
                print("downloading picture failed!")
        print('$' * 18)
        print('Crawling the product detail page')
        productCNinfodownload(productname, producturl, mkpath)
        print('On to the next product')
    print('On to the next product list page')
```
All done: two cups of bubble tea earned. Time to drink up.