First, create the project:
scrapy startproject <project_name>
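For orientation, scrapy startproject generates roughly the layout below (I am assuming the project is named user here so it matches the imports used later in this post); settings.py, which we edit next, sits inside the inner package:

user/
    scrapy.cfg          # deploy configuration
    user/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py     # the file we modify next
        spiders/
            __init__.py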
# After the project is created, the first file to open is settings.py.
# This setting controls whether the crawler obeys robots.txt during the crawl; of course we are not going to obey it, and I have already changed it:
ROBOTSTXT_OBEY = False
# We said we would use a proxy pool, so where do the IPs come from? You can buy them from an online proxy vendor, or scrape them from the web yourself; either way they need to be filtered afterwards.
I wrote a small script (pm.py, listed after the sketch below) to filter out the usable IPs.
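Before the script itself, one assumption worth spelling out: pm.py expects a MySQL table named proxypool whose rows come back as (id, host, port) tuples, which is how p[0], p[1] and p[2] are used below. The column names here are my own sketch, not something given in the original post; creating such a table could look like this:

import pymysql

conn = pymysql.connect(host='10.15.112.30', user='bingbing',
                       password='a11112222', database='jobweb', charset='utf8')
cursor = conn.cursor()
# assumed layout: an auto-increment id plus the proxy host and port
cursor.execute("""
    create table if not exists proxypool (
        id   int auto_increment primary key,
        host varchar(64) not null,
        port varchar(16) not null
    )
""")
conn.commit()
conn.close()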
pm.py
import pymysql
from queue import Queue


class PM(object):
    def __init__(self):
        # connect to the database
        self.conn = pymysql.connect(host='10.15.112.30', user='bingbing',
                                    password='a11112222', database='jobweb',
                                    charset='utf8')
        # create a cursor
        self.cursor = self.conn.cursor()
        # create the queue
        self.proxy_q = Queue()

    def get_all_proxy(self):
        # query all proxies from the database
        self.cursor.execute('select * from proxypool')
        res = self.cursor.fetchall()
        # push every proxy row into the queue
        for proxy in res:
            self.proxy_q.put(proxy)

    def filter_proxy(self):
        # keep looping while the queue is not empty
        while not self.proxy_q.empty():
            # take one proxy out of the queue
            p = self.proxy_q.get()
            base_url = 'http://www.baidu.com/s?wd=ip'
            proxy = {
                'http': 'http://%s:%s' % (p[1], p[2]),
                'https': 'http://%s:%s' % (p[1], p[2])
            }
            try:
                response = requests.get(base_url, proxies=proxy, timeout=10)
                # check the status code
                if 200 <= response.status_code < 300:
                    html = response.text
                    # Baidu's "ip" search page contains 本机IP when it loads correctly
                    if '本机IP' in html:
                        print(p[1], 'usable ------------------')
                    else:
                        # drop the proxy
                        self.drop_proxy(p)
                else:
                    # drop the proxy
                    self.drop_proxy(p)
            except Exception:
                # drop the proxy
                self.drop_proxy(p)

    # helper that deletes a proxy from the table
    def drop_proxy(self, p):
        try:
            self.cursor.execute('delete from proxypool where id=%s', (p[0],))
            self.conn.commit()
            print('deleted proxy:', p[1])
        except Exception:
            print('failed to delete proxy')

    def close(self):
        # close the cursor and the connection
        self.cursor.close()
        self.conn.close()

    def main(self):
        # load the proxies
        self.get_all_proxy()
        # spawn the greenlets
        g_list = []
        for i in range(2):
            g = gevent.spawn(self.filter_proxy)
            g_list.append(g)
        # start them and wait for them all to finish
        gevent.joinall(g_list)

    def random(self):
        # fetch one random proxy
        sql = 'select * from proxypool order by rand() limit 1'
        self.cursor.execute(sql)
        proxy = self.cursor.fetchone()
        return proxy


if __name__ == '__main__':
    # monkey.patch_all() has to run before requests is imported,
    # which is why these imports live down here
    from gevent import monkey
    monkey.patch_all()
    import gevent
    import requests

    pm = PM()
    pm.main()
    # pm.random()
    pm.close()
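As a usage sketch of my own (not part of the original post), you could push freshly scraped proxies into the table first and then run the filter script:

from pm import PM   # adjust the import path to your project layout

scraped = [('1.2.3.4', '8080'), ('5.6.7.8', '3128')]   # made-up sample data

pm = PM()
for host, port in scraped:
    pm.cursor.execute('insert into proxypool (host, port) values (%s, %s)',
                      (host, port))
pm.conn.commit()
pm.close()
# then run:  python pm.py   (checks every proxy and deletes the dead ones)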
Then I wrote a script that handles rotating random proxies in and out:
mymiddlewares.py
from fake_useragent import UserAgent
from user import settings
import random
import base64
from user.pm import PM


# random User-Agent middleware
class RandomUA(object):
    def __init__(self):
        self.ua = UserAgent()

    def process_request(self, request, spider):
        # attach a randomly chosen User-Agent to every request
        request.headers['User-Agent'] = self.ua.random


# random proxy middleware
class RandomProxy(object):
    def __init__(self):
        self.pm = PM()

    def process_request(self, request, spider):
        # fetch a random proxy from the database (random() is defined in pm.py)
        proxy = self.pm.random()
        p = 'http://%s:%s' % (proxy[1], proxy[2])
        # hand the proxy to the downloader via request.meta
        request.meta['proxy'] = p


# random authenticated proxy middleware
class RandomAuthProxy(object):
    def process_request(self, request, spider):
        proxy = random.choice(settings.AUTH_PROXIES)
        auth = base64.b64encode(bytes(proxy['auth'], encoding='utf-8'))
        # set the proxy authentication header
        request.headers['Proxy-Authorization'] = b'Basic ' + auth
        # set the proxy itself
        request.meta['proxy'] = 'http://%s' % proxy['host']


# spider middleware
from scrapy.exceptions import CloseSpider


# shut the spider down when a bad response comes back
class MySpiderMiddleware(object):
    def process_spider_input(self, response, spider):
        print('response status:', response.status)
        if not 200 <= response.status < 300:
            raise CloseSpider('spider error, shutting down')
        return None

    def process_spider_output(self, response, result, spider):
        for res in result:
            yield res
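Two things this listing relies on but never shows: RandomAuthProxy reads an AUTH_PROXIES list from settings.py, and MySpiderMiddleware has to be registered before Scrapy will ever call it. As a minimal sketch (the values and the user. module prefix are placeholders of my own, adjust them to your project name), settings.py could contain:

# authenticated proxies for RandomAuthProxy:
# 'auth' is "user:password", 'host' is "ip:port" (placeholder values)
AUTH_PROXIES = [
    {'auth': 'proxyuser:proxypass', 'host': '1.2.3.4:8888'},
]

# register the spider middleware so process_spider_input() actually runs
SPIDER_MIDDLEWARES = {
    'user.mymiddlewares.MySpiderMiddleware': 543,
}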
# Next we write the spider itself:
baidu.py
# -*- coding: utf-8 -*-
import scrapy


class BosszpSpider(scrapy.Spider):
    name = 'baidu'
    allowed_domains = ['www.baidu.com']
    start_urls = ['http://www.baidu.com/s?wd=ip']
    # per-spider settings
    custom_settings = {
        'DOWNLOADER_MIDDLEWARES': {
            # use the import paths of your own middlewares here
            'boss.mymiddlewares.RandomUA': 1,
            'boss.mymiddlewares.RandomProxy': 2,
        },
        # download timeout, 5-10 seconds
        'DOWNLOAD_TIMEOUT': 10,
        # retry a failed download 2-3 times
        'RETRY_TIMES': 3,
    }

    def parse(self, response):
        # extract the IP address shown on the result page
        data = response.xpath('//span[@class="c-gap-right"]/text()').extract()
        print(data)
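Run the spider from the project directory with the standard Scrapy command:

scrapy crawl baidu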
# Output:
['本机IP:\xa0203.6.149.130']
['本机IP:\xa047.74.9.208']
# I ran it twice here, hence the two results.
# That is the simplest possible use of a proxy pool.
Corrections are welcome if anything here is wrong.
————————————————
Copyright notice: this is an original article by the CSDN blogger "yang_bingo", released under the CC 4.0 BY-SA license. Please include a link to the original source and this notice when reposting.
Original link: https://blog.csdn.net/yang_bingo/article/details/80559528