import requests
import time
import random
import os
import threading
from lxml import etree
from multiprocessing.dummy import Pool  # thread-based pool
user_agent_list = [
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
    'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
    'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
    'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60',
    'Opera/8.0 (Windows NT 5.1; U; en)',
    'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
    'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
]
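
# Rotating a random User-Agent per request makes the traffic look like it
# comes from many different browsers, which lowers the chance of a ban.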
# Fetch free proxy IPs
def get_ip_list(headers, page):
    ip_list = []
    for i in range(int(page)):
        # Scrape one page of free proxies
        url = 'https://www.kuaidaili.com/free/inha/{}/'.format(i + 1)
        # print("Crawling URL:", url)
        # Fetch the proxy IP addresses
        web_data = requests.get(url, headers=headers)
        if web_data.status_code == 200:
            tree0 = etree.HTML(web_data.text)
            ip_lists = tree0.xpath('//table/tbody/tr/td[@data-title="IP"]/text()')
            port_lists = tree0.xpath('//table/tbody/tr/td[@data-title="PORT"]/text()')
            type_lists = tree0.xpath('//table/tbody/tr/td[@data-title="类型"]/text()')  # protocol column, currently unused
            # print(port_lists)
            for x, y in zip(ip_lists, port_lists):
                ip_list.append(x + ":" + y)
        time.sleep(3)  # throttle so the proxy site does not ban us for requesting too fast
    return ip_list
# Save the proxy IPs to a txt file
def save_ip_list():
    header = {'User-Agent': random.choice(user_agent_list)}
    ip_list = get_ip_list(headers=header, page=3)
    os.makedirs('userCsdn', exist_ok=True)  # make sure the output directory exists
    with open('userCsdn/ipList.txt', 'a') as fp:
        for ip in ip_list:
            fp.write('http://' + ip + '\n')  # requests expects a full scheme, e.g. http://1.2.3.4:8080
    print('Proxy list saved')
# Read the IP txt file and return one random proxy IP
def return_ip():
    with open('userCsdn/ipList.txt', 'r') as fp:
        ip_list = fp.readlines()
    ip = random.choice(ip_list).strip('\n')
    return ip
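
# Optional sketch (not part of the original flow): free proxies die quickly,
# so probing one before use can avoid wasted requests. check_proxy and the
# probe URL are assumptions for illustration, not anything CSDN-specific.
def check_proxy(ip, timeout=5):
    try:
        requests.get('http://www.baidu.com', proxies={'http': ip}, timeout=timeout)
        return True
    except requests.exceptions.RequestException:
        return False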
##############################################################
# Fetch a user's homepage; not actually used in the final run
def findCsdnPage(url, ip):
    # Forged cookies
    cookies = dict(uuid='b18f0e70-8705-470d-bc4b-09a8da617e15', UM_distinctid='15d188be71d50-013c49b12ec14a-3f73035d-100200-15d188be71ffd')
    header = {'User-Agent': random.choice(user_agent_list)}
    proxies = {'http': ip}
    page_text = requests.get(url, headers=header, cookies=cookies, proxies=proxies)
    print(page_text.text)
# Append one fan record to the txt file for its code age
def saveTxt(year, data):
    with open('userCsdn/userCsdn_' + str(year) + '.txt', 'a', encoding="utf-8") as fp:
        fp.write(str(data) + '\n')
# Create the txt file for a given code age (years on CSDN) if it is missing
def mdtxt(year):
    if not os.path.exists('./userCsdn/userCsdn_' + str(year) + '.txt'):
        file = open('./userCsdn/userCsdn_' + str(year) + '.txt', 'w')
        file.close()
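
# Note: saveTxt opens files in append mode, which already creates a missing
# file, so mdtxt is strictly redundant; it is kept to mirror the original flow.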
fansNums = 0  # number of matching fans found so far
fans_lock = threading.Lock()  # fansNums is shared across worker threads
# Main routine: fetch one page of fans and keep those with code age >= 20 years
def findFans(userid, page, ip):
    global fansNums
    fans_url = 'https://blog.csdn.net/phoenix/web/v1/fans/list'
    param = {
        'page': page,
        'pageSize': '50',
        'blogUsername': userid
    }
    cookies = dict(uuid='b18f0e70-8705-470d-bc4b-09a8da617e15', UM_distinctid='15d188be71d50-013c49b12ec14a-3f73035d-100200-15d188be71ffd')
    header = {'User-Agent': random.choice(user_agent_list)}
    proxies = {'http': ip}
    response = requests.get(fans_url, params=param, headers=header, cookies=cookies, proxies=proxies)
    page_json = response.json()
    # print(page_json['data']['list'])
    for item in page_json['data']['list']:
        if int(item['years']) >= 20:
            mdtxt(int(item['years']))
            saveTxt(int(item['years']), item)
            with fans_lock:  # avoid racy increments from concurrent threads
                fansNums += 1
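
# Optional sketch (an assumption, not in the original script): a dead proxy or
# a malformed response raises inside findFans and would kill its worker thread.
# find_fans_safe is a hypothetical wrapper that retries with a fresh random proxy.
def find_fans_safe(userid, page, retries=3):
    for _ in range(retries):
        try:
            findFans(userid, page, return_ip())
            return
        except (requests.exceptions.RequestException, KeyError, TypeError, ValueError):
            time.sleep(1)  # brief back-off before retrying with another proxy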
##############################################################
def forSave(range_list):
    for page in range(range_list[0], range_list[1]):
        findFans('blogdevteam', str(page), ip)
        print('page', page, 'done; fans with code age >= 20 years so far:', fansNums)
if __name__ == '__main__':
    ip = return_ip()  # one random proxy shared by all worker threads
    # range() excludes its stop value, so adjacent ranges share a boundary;
    # the original [[1, 141], [142, 283], ...] silently skipped pages 141, 283, 424 and 564
    range_lists = [[1, 142], [142, 284], [284, 425], [425, 565], [565, 706]]
    pool = Pool(5)
    pool.map(forSave, range_lists)
    pool.close()
    pool.join()