https://github.com/tao540752/ProxyPool.git
为什么要使用代理池
许多网站有专门的反爬虫措施,可能遇到封IP等问题,可通过定时的检测维护得到多个可用代理
代理池的要求
多站抓取,异步检测。定时筛查,持续更新。提供接口,易于抓取。
代理池架构
获取器 -> 过滤器 -> 代理队列 -> 定时检测
安装第三方库
environs==9.3.0
Flask==1.1.2
attrs==20.3.0
retrying==1.3.3
aiohttp==3.7.4
requests==2.25.1
loguru==0.5.3
pyquery==1.4.3
supervisor==4.2.1
redis==3.5.3
lxml==4.6.5
fake_headers==1.0.2
maxminddb_geolite2==2018.703
proxypool/setting.py
修改redis端口 密码 等
REDIS_PASSWORD = env.str('REDIS_PASSWORD', 'moyufed')
processors/server
查看路由,获取单个IP、所有可用IP
利用代理池爬取微信文章
from urllib.parse import urlencode
import requests
from requests.exceptions import ConnectionError
base_url='https://weixin.sogou.com/weixin?'
header={
'Cookie': 'SUID=D39A25B77C43910A0000000061D44F86; SUV=1641303947413962; SNUID=4D07BB299D98495C43C33F499EC1B02D; IPLOC=CN4401',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'
}
proxy_pool_url='http://localhost:5555/random'
proxy=None
def get_proxy():
try:
response=requests.get(url=proxy_pool_url)
if response.status_code==200:
return response.text
else:
return None
except ConnectionError as e:
return None
def get_html(url):
global proxy
try:
if proxy:
proxies = {
"http": "http://139.0.31.146:8080"
}
response=requests.get(url=url,allow_redirects=False,headers=header,proxies=proxies)
else:
response=requests.get(url=url,allow_redirects=False,headers=header)
if response.status_code == 200:
return response.text
if response.status_code == 302:
print('302')
proxy=get_proxy()
if proxy:
print('using proxy',proxy)
return get_html(url)
else:
print('error')
return None
except ConnectionError:
return get_html(url)
def get_index(keyword,page):
data={
'query':keyword,
'type':2,
'page':page
}
queries=urlencode(data)
url=base_url+queries
html=get_html(url)
print(html)
def main():
for i in range(1,3):
get_index('风景',i)
if __name__=='__main__':
main()