Overriding Scrapy Downloader Middleware and Building a Cookie Pool
:::tip Rewriting Scrapy middleware :::

Rewriting the User-Agent middleware
```python
import json    # JSON handling
import redis   # Python Redis client
import random  # random selection
from .useragent import agents  # import the list of User-Agent strings
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware  # User-Agent middleware
from scrapy.downloadermiddlewares.retry import RetryMiddleware  # retry middleware

class UserAgentmiddleware(UserAgentMiddleware):
    def process_request(self, request, spider):
        agent = random.choice(agents)
        request.headers["User-Agent"] = agent
```
- A class UserAgentmiddleware is defined, inheriting from UserAgentMiddleware.
- It defines process_request(request, spider); Scrapy calls this method for every request that passes through the middleware.
- A User-Agent is chosen at random from the list.
- The request's User-Agent header is set to that randomly chosen value.
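For the middleware to take effect, it must also be registered in settings.py. A minimal sketch, assuming the project package is named myproject and the middlewares live in myproject/middlewares.py (the module path and priority numbers are assumptions, not from the original):

```python
# settings.py -- "myproject" is a hypothetical package name
DOWNLOADER_MIDDLEWARES = {
    # disable Scrapy's built-in User-Agent middleware
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    # enable the random User-Agent middleware defined above
    'myproject.middlewares.UserAgentmiddleware': 400,
    # enable the Cookie middleware defined later in this article
    'myproject.middlewares.CookieMiddleware': 550,
}
```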
Cookie Pool
Maintain a Cookie pool in Redis with the following capabilities:
- fetch a Cookie
- update a Cookie
- delete a Cookie
- check whether a Cookie is still usable and react accordingly (e.g. retry the request)
Create a new file named cookies.py and import what it needs:
```python
import requests
import json
import redis
import logging
from .settings import REDIS_URL  # read REDIS_URL from settings.py
```
Store the login accounts and passwords in Redis as key:value pairs. Using db0 for this is not recommended; keep the accounts and passwords in a Redis db of their own.
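For illustration, seeding one account could look like the sketch below; the account and password values are placeholders, and db=2 matches the account db that the code in the next section connects to:

```python
import redis
from .settings import REDIS_URL

# hypothetical example: one login stored as key = account, value = password
reds = redis.Redis.from_url(REDIS_URL, db=2, decode_responses=True)
reds.set('my_account', 'my_password')
```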
Getting a Cookie
```python
import requests
import json
import redis
import logging
from .settings import REDIS_URL

logger = logging.getLogger(__name__)

# Connect to Redis via REDIS_URL. decode_responses=True is required;
# without it everything comes back as bytes and is unusable downstream.
reds = redis.Redis.from_url(REDIS_URL, db=2, decode_responses=True)

login_url = 'http://haoduofuli.pw/wp-login.php'

# Fetch a Cookie
def get_cookie(account, password):
    s = requests.Session()
    payload = {
        'log': account,
        'pwd': password,
        'rememberme': "forever",
        'wp-submit': "登录",
        'redirect_to': "http://www.haoduofuli.pw/wp-admin/",
        'testcookie': "1"
    }
    response = s.post(login_url, data=payload)
    cookies = response.cookies.get_dict()
    logger.warning("Got Cookie successfully! (account: %s)" % account)
    return json.dumps(cookies)
```
We use the requests module to submit the login form and obtain a Cookie, returning it serialized as JSON (without serialization it would land in Redis as plain text and be unusable when read back). Next, write the Cookie into the Redis database so that other spiders can use it as well:
```python
def init_cookie(red, spidername):
    # reds (db=2) holds the account -> password pairs;
    # red is the Cookie db connection passed in by the middleware
    redkeys = reds.keys()
    for user in redkeys:
        password = reds.get(user)
        if red.get("%s:Cookies:%s--%s" % (spidername, user, password)) is None:
            cookie = get_cookie(user, password)
            red.set("%s:Cookies:%s--%s" % (spidername, user, password), cookie)
```
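The full code at the end of this article also imports update_cookie and remove_cookie from cookies.py, but they are never shown. A minimal sketch of what they might look like under the same key scheme (spidername:Cookies:account--password); their exact behavior is an assumption:

```python
def update_cookie(accountText, red, spidername):
    # hypothetical sketch: log in again and overwrite the stored Cookie
    user, password = accountText.split("--")
    cookie = get_cookie(user, password)
    red.set("%s:Cookies:%s" % (spidername, accountText), cookie)

def remove_cookie(accountText, red, spidername):
    # hypothetical sketch: delete the Cookie of an account that no longer works
    red.delete("%s:Cookies:%s" % (spidername, accountText))
```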
Rewriting the Cookie Middleware
```python
class CookieMiddleware(RetryMiddleware):
    def __init__(self, settings, crawler):
        # The subclass cannot simply replace __init__; it has to call the
        # parent's initializer for the inherited retry machinery to work
        RetryMiddleware.__init__(self, settings)
        self.rconn = redis.from_url(settings['REDIS_URL'], db=1,
                                    decode_responses=True)  # decode_responses=True returns str instead of bytes
        # write the Cookies into Redis
        init_cookie(self.rconn, crawler.spider.name)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings, crawler)

    def process_request(self, request, spider):
        redisKeys = self.rconn.keys()
        while len(redisKeys) > 0:
            elem = random.choice(redisKeys)
            if spider.name + ':Cookies' in elem:
                cookie = json.loads(self.rconn.get(elem))
                request.cookies = cookie
                request.meta["accountText"] = elem.split("Cookies:")[-1]
                break
```
For how a middleware can access the settings this way, see the official Scrapy documentation.
Full code
```python
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals
import json
import redis
import random
from .useragent import agents
from .cookies import init_cookie, remove_cookie, update_cookie
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware
from scrapy.downloadermiddlewares.retry import RetryMiddleware
import logging

logger = logging.getLogger(__name__)

class UserAgentmiddleware(UserAgentMiddleware):
    def process_request(self, request, spider):
        agent = random.choice(agents)
        request.headers["User-Agent"] = agent

class CookieMiddleware(RetryMiddleware):
    def __init__(self, settings, crawler):
        RetryMiddleware.__init__(self, settings)
        self.rconn = redis.from_url(settings['REDIS_URL'], db=1,
                                    decode_responses=True)  # decode_responses=True returns str instead of bytes
        init_cookie(self.rconn, crawler.spider.name)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings, crawler)

    def process_request(self, request, spider):
        redisKeys = self.rconn.keys()
        while len(redisKeys) > 0:
            elem = random.choice(redisKeys)
            if spider.name + ':Cookies' in elem:
                cookie = json.loads(self.rconn.get(elem))
                request.cookies = cookie
                request.meta["accountText"] = elem.split("Cookies:")[-1]
                break
            #else:
            #    redisKeys.remove(elem)

    #def process_response(self, request, response, spider):
    #    """
    #    Check whether the Cookie has expired and react accordingly,
    #    e.g. update the Cookie or remove an account that no longer works.
    #    """
```