How to use proxy_pool in Scrapy

yb3bgrhw · published 2023-02-04 in Other
Follow (0) | Answers (2) | Views (212)

Set up a downloader middleware (in settings.py):
DOWNLOADER_MIDDLEWARES = {
    'Article.middlewares.RandomUserAgentMiddleware': 543,
    # the old 'scrapy.contrib.downloadermiddleware.useragent' path was removed;
    # disable the built-in UserAgentMiddleware via its current import path
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}
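
The middleware below also reads a RANDOM_UA_TYPE setting; a minimal sketch of the corresponding settings.py line (the default 'random' is just one of the attributes exposed by fake_useragent):

# settings.py -- which fake_useragent attribute to use ('random', 'chrome', 'firefox', ...)
RANDOM_UA_TYPE = 'random'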

# middlewares.py
import logging

import requests
from fake_useragent import UserAgent  # UserAgent() exposes .random/.chrome/... attributes


class RandomUserAgentMiddleware(object):
    logger = logging.getLogger(__name__)

    def __init__(self, crawler):
        super(RandomUserAgentMiddleware, self).__init__()
        self.ua = UserAgent()
        self.ua_type = crawler.settings.get('RANDOM_UA_TYPE', 'random')

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_request(self, request, spider):
        def get_ua():
            return getattr(self.ua, self.ua_type)
        request.headers.setdefault('User-Agent', get_ua())
        request.meta['proxy'] = 'http://' + self.proxy()

    def proxy(self):
        """Fetch a proxy from the local proxy_pool service and verify that it works."""
        proxy = requests.get("http://127.0.0.1:5010/get").text
        try:
            print('get proxy ...')
            ip = {"http": "http://" + proxy, "https": "https://" + proxy}
            r = requests.get("http://www.baidu.com", proxies=ip, timeout=4)
            print(r.status_code)
            if r.status_code == 200:
                return proxy
            # the proxy answered but not with 200: drop it and try another one
            self.delete_proxy(proxy)
            return self.proxy()
        except Exception:
            print('get proxy again ...')
            self.delete_proxy(proxy)
            return self.proxy()

    def process_response(self, request, response, spider):
        """Handle the returned response."""
        # If the response status is not 200, reschedule the current request
        if response.status != 200:
            print("again response ip:")
            # attach a fresh proxy to the current request
            request.meta['proxy'] = 'http://' + self.proxy()
            return request
        return response

    def process_exception(self, request, exception, spider):
        self.logger.debug('Get exception')
        request.meta['proxy'] = 'http://' + self.proxy()
        return request

    def delete_proxy(self, proxy):
        # ask proxy_pool to remove a proxy that no longer works
        requests.get("http://127.0.0.1:5010/delete/?proxy={}".format(proxy))
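
For reference, the two proxy_pool HTTP endpoints the middleware relies on can be exercised directly; a minimal standalone sketch (assuming proxy_pool is running locally on port 5010, and using a placeholder proxy address for the delete call):

import requests

# fetch one proxy from the pool; older proxy_pool versions return plain text,
# newer ones return JSON (see the first answer below)
print(requests.get("http://127.0.0.1:5010/get").text)

# remove a proxy that turned out to be dead ("1.2.3.4:8080" is just a placeholder)
requests.get("http://127.0.0.1:5010/delete/?proxy={}".format("1.2.3.4:8080"))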

ctehm74n1#

The /get endpoint now returns JSON, so note that you need to change proxy = requests.get("http://127.0.0.1:5010/get").text to proxy = requests.get("http://127.0.0.1:5010/get").json().get("proxy").
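
A minimal sketch of the proxy() method with that change applied (the rest of the class from the question stays the same):

    def proxy(self):
        # /get now returns JSON such as {"proxy": "1.2.3.4:8080", ...}
        proxy = requests.get("http://127.0.0.1:5010/get").json().get("proxy")
        try:
            print('get proxy ...')
            ip = {"http": "http://" + proxy, "https": "https://" + proxy}
            r = requests.get("http://www.baidu.com", proxies=ip, timeout=4)
            if r.status_code == 200:
                return proxy
            self.delete_proxy(proxy)
            return self.proxy()
        except Exception:
            print('get proxy again ...')
            self.delete_proxy(proxy)
            return self.proxy()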


dsekswqp2#

How can I tell whether it is actually running? After I run it, only RandomUserAgentMiddleware shows up in the output, but no other proxy information. BTW, I have already changed it to proxy = requests.get("http://127.0.0.1:5010/get").json().get("proxy").
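
Scrapy only lists the middleware name once at startup under "Enabled downloader middlewares"; to get per-request proxy information into the output, one option (a sketch based on the class above, not part of the original code) is to log the proxy inside process_request and run with LOG_LEVEL set to 'DEBUG' or 'INFO':

    def process_request(self, request, spider):
        request.headers.setdefault('User-Agent', getattr(self.ua, self.ua_type))
        proxy = self.proxy()
        # this line shows up in the crawl log for every request
        self.logger.info('Using proxy %s for %s', proxy, request.url)
        request.meta['proxy'] = 'http://' + proxy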
