scrapy结合加速乐完成无损爬取

Created
Sep 7, 2023 05:47 AM
Tags
中间件重要知识：`process_response` 返回 None 不合法；返回 Response 代表继续通过后续中间件；返回 Request 则重新走调度流程（重新入队下载）。
 
本文文章中 需要在settings中设置 COOKIES_ENABLED = True
这里超链接一篇文章(不理解COOKIES_ENABLED 的可以看看)—scrapy 中 COOKIES_ENABLED 设置_NealHuiwen的博客-CSDN博客
 
其次需要开启爬虫中的cookie中间件
class HandleCookieMiddleware: def process_request(self, request, spider): pass def process_response(self, request, response, spider): status_code = response.status if status_code == 521: print("jsl>>>>>>", response.url) cookies, response, ok = self.get_cookie(response) if not ok: return response request.cookies = cookies request.headers[ 'User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36' return request
 
def get_cookie(self, response) -> (dict, Response, bool): if pattern := re.findall(r'cookie=(.*?);location', response.text): encrypt_jsfuck = pattern[0] else: return '', response, False # return response DJ = DecryptJsl() decrypt_jsfuck = str(execjs.eval(encrypt_jsfuck)) first_decrypt_cookie = decrypt_jsfuck.split('=')[1].split(';')[0] jsl_cookie_key = decrypt_jsfuck.split('=')[0] first_set_cookie = {} for c in str(response.headers.getlist('Set-Cookie')[0].decode("utf-8")).split(";"): if "=" not in c: continue first_set_cookie[c.split("=")[0]] = c.split("=")[1] first_cookies = { **first_set_cookie, jsl_cookie_key: first_decrypt_cookie} res1 = requests.get(response.url, cookies=first_cookies, headers={ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36', }) data = json.loads(re.findall(r';go\((.*?)\)', res1.text)[0]) second_decrypt_cookie = DJ.get_parameter(data) cookies = { **first_set_cookie, jsl_cookie_key: second_decrypt_cookie} return cookies, None, True