过去一年博客网站增长情况

以国内博客界最大的博客导航网站“卢松松导航”收录的网站数量，来看过去一年内博客网站的增长情况：今年6月的数据同比去年6月增长了约3.0%（由17877个增至18411个，增加534个）。疫情以来，受群众空余时间变多、5G网络技术普及的影响，增长最大的是短视频的播放量，而原有图文传播方式的增长变得越来越缓慢。

从增长量上看，仍然是经济比较活跃、文化底蕴较为深厚的地区增加的数量最多；值得注意的是，经济欠发达地区的增长率较大，这也与去年同期基数较小有关。

各地区网站数量的排名与去年相比基本类似。

地区	2020Nov	2020Dec	2021Jan	2021Mar	2021Jun	2022Jun	去年增长
QQ空间	29	29	29	29	29	31	2
上海	740	742	745	746	750	767	17
云南	274	275	275	277	281	288	7
内蒙古	100	100	100	100	100	107	7
北京	1523	1525	1536	1539	1549	1578	29
友情站	259	260	262	266	274	295	21
台湾	161	161	161	161	161	162	1
吉林	116	116	117	117	118	120	2
四川	943	946	952	957	963	998	35
国外	509	509	513	515	521	537	16
天津	175	175	175	175	176	182	6
宁夏	53	53	53	53	53	55	2
安徽	520	521	523	524	527	544	17
山东	1108	1111	1115	1119	1127	1157	30
山西	243	244	246	249	252	260	8
广东	2634	2642	2655	2667	2692	2749	57
广西	348	349	352	354	358	368	10
新疆	78	79	79	79	82	89	7
江苏	1028	1032	1034	1039	1043	1067	24
江西	343	345	350	350	354	362	8
河北	546	547	554	557	562	575	13
河南	1024	1029	1035	1042	1052	1083	31
浙江	870	872	878	878	883	902	19
海南	96	96	97	97	99	105	6
湖北	846	846	854	857	864	891	27
湖南	441	442	442	443	441	463	22
澳门	26	26	26	26	26	26	0
甘肃	141	141	144	146	147	154	7
福建	646	646	649	653	656	674	18
西藏	12	12	12	12	12	13	1
贵州	233	233	238	238	242	261	19
辽宁	271	271	271	271	272	277	5
逆袭会	26	26	26	26	26	26	0
重庆	426	426	429	433	440	466	26
陕西	492	494	496	497	501	523	22
青海	71	71	71	71	71	71	0
香港	123	123	124	126	131	136	5
黑龙江	178	178	41	41	42	49	7
总计	17652	17693	17659	17730	17877	18411	534

获取网站数量代码：

# -*- coding: utf-8 -*-

# 加载引用库模块
import requests
from bs4 import BeautifulSoup
#import time
import pandas as pd
import chardet
import socket
import datetime
import time
import numpy as np
def get_headers():
    """Return one randomly chosen HTTP header dict.

    Every candidate is a dict with a single ``'user-agent'`` entry, so the
    result can be passed directly as ``headers=`` to ``requests.get``.
    A new random pick is made on every call.

    Returns:
        dict: ``{'user-agent': <browser user-agent string>}``.
    """
    headers = [
        {'user-agent': 'Mozilla/5.0'},
        {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'},
        {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36 Edg/91.0.864.59'},
        # Google Chrome.
        # These three strings previously contained a stray ".html" after the
        # version numbers ("Mozilla/5.0.html", "Chrome/39.0.html.2171.71"),
        # which made them malformed user agents.
        {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36'},
        {'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'},
        {'user-agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16'},
    ]
    # The list of dicts is treated as a 1-D sequence; one dict is returned.
    return np.random.choice(headers)
def get_IPs():
    """Return one randomly chosen proxy mapping for ``requests``.

    ``requests`` selects a proxy by the URL scheme used as the dict key.
    The key used to be misspelled ``'htttp'``, which matches no scheme, so
    the proxies were silently ignored; it is now ``'http'``.

    Only ``http://`` URLs are routed through the returned proxy. Requests to
    ``https://`` URLs still go out directly, because no ``'https'`` entry is
    provided.

    Returns:
        dict: ``{'http': 'http://<ip>:<port>'}``.
    """
    history_IP = [
        {'http': 'http://36.102.172.36:16919'},
        {'http': 'http://61.188.107.102:15991'},
        {'http': 'http://182.38.126.162:23583'},
        {'http': 'http://117.26.193.225:16872'},
    ]
    return np.random.choice(history_IP)
# Check that the target site is reachable before scraping it.
Targeturl = 'https://daohang.lusongsong.com/'
# Not used by the requests below (they call get_headers()); kept so the
# module-level name stays defined.
headers = {'user-agent': 'Mozilla/5.0'}
# A timeout keeps the script from hanging forever on an unresponsive server.
neirong = requests.get(Targeturl, headers=get_headers(), proxies=get_IPs(), timeout=30)
# Judge reachability by the HTTP status. The declared charset (the previous
# check) says nothing about success and may be reported as 'UTF-8' or None.
if neirong.ok:
    print('Website page connect succesfull!')
else:
    print('Can not connect this target website page, please check')

# Download the home page, which lists every category.
url = 'https://daohang.lusongsong.com/'
print('开始爬取页面:', url)
# BeautifulSoup does not fetch pages itself, so requests does the download.
r = requests.get(url, headers=get_headers(), proxies=get_IPs(), timeout=30)
# r.content is the body with gzip/deflate transfer encoding already undone.
after_gzip = r.content
single_html_gzip = after_gzip.decode('UTF-8')
homepage = BeautifulSoup(single_html_gzip, 'html.parser')
# The category links live in <div class="bd" id="coolsite">.
coolsite = homepage.find('div', attrs={'class': "bd", 'id': "coolsite"})
if coolsite is None:
    # Fail with a clear message instead of an AttributeError on None.
    raise RuntimeError('div#coolsite not found on ' + url)
# Each category entry is a <strong> element inside that div.
category = coolsite.find_all('strong')
# Filled by the scraping loop further down; one dict per category.
all_datas = []
# Fetch one category page and read how many sites it lists.
def Get_leibei_count(html, Leibie, timeout=30):
    """Return the site count shown on a category page.

    The count is read from the third ``<div class="path">`` element of the
    page: it is the text between the last ``'['`` and the next ``']'``.

    Args:
        html: URL of the category page (a URL, despite the parameter name).
        Leibie: Category name. Accepted for the caller's convenience; it is
            not used inside this function.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        str: The count exactly as it appears on the page (not converted to
        a number).

    Raises:
        IndexError: If the page has fewer than three ``div.path`` elements.
    """
    r = requests.get(html, headers=get_headers(), proxies=get_IPs(), timeout=timeout)
    # r.content is the body with gzip/deflate transfer encoding already undone.
    single_html_gzip = r.content.decode('UTF-8')
    soup = BeautifulSoup(single_html_gzip, "html.parser")
    html_docs = soup.find_all('div', class_='path')
    if len(html_docs) < 3:
        # Same exception type as plain indexing would raise, but it names
        # the page that did not have the expected layout.
        raise IndexError(
            "expected at least 3 'div.path' elements on %s, found %d"
            % (html, len(html_docs))
        )
    html_doc = html_docs[2].get_text().split('[')
    count = html_doc[-1].split(']')[0]
    print(count)
    return count
# Visit every category entry and record its name, URL and site count.
for i in category:
    Leibie = i.get_text()
    urltemp = i.find('a').get('href')
    is_absolute = urltemp[0:4] == 'http'
    if is_absolute:
        # The link is already a full URL: use it unchanged and print the
        # repeated '类别名错误' marker.
        leibieurl = urltemp
        print('类别名错误'*9)
    else:
        # Site-relative link: prefix it with the site root.
        leibieurl = 'https://daohang.lusongsong.com' + urltemp
    webcount = Get_leibei_count(leibieurl, Leibie)
    print('现在爬取',Leibie,'网站：',leibieurl,'获得【',webcount,'】')
    record = {'Leibie': Leibie, 'leibieurl': leibieurl, 'webcount': webcount}
    all_datas.append(record)
# Write the collected rows to a new Excel file stamped with today's date.
today = str(datetime.date.today())
order = ["Leibie", "leibieurl", "webcount"]  # column order in the sheet
# columns= fixes the column order and still yields a valid (empty) frame
# when nothing was scraped; selecting pf[order] raised KeyError in that case.
pf = pd.DataFrame(all_datas, columns=order)
# Replace empty cells with a single space.
pf.fillna(' ', inplace=True)
# Used as a context manager, ExcelWriter saves and closes the file on exit.
# ExcelWriter.save() and to_excel(encoding=...) were removed in pandas 2.0.
with pd.ExcelWriter('Blog-lusongsong-daohang-list-'+today+'.xlsx') as file_path:
    pf.to_excel(file_path, index=False, sheet_name="sheet1")
print("保存文件成功，处理结束")