import os
import sys

import scrapy
from scrapy.cmdline import execute


class CustomSpider(scrapy.Spider):
    name = 'customs_gov'
    allowed_domains = ['www.customs.gov.cn']

    # Explicitly define the entry requests (web refs 6, 7)
    def start_requests(self):
        urls = [
            'http://www.customs.gov.cn/customs/302249/zfxxgk/2799825/302274/302277/4899681/index.html'
        ]
        for url in urls:
            yield scrapy.Request(
                url=url,
                callback=self.parse,
                headers={
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
                    'Referer': 'https://www.customs.gov.cn/'
                }
            )

    def parse(self, response):
        # Debug the response status (web ref 4)
        if response.status == 200:
            print("===== Response snippet =====")
            print(response.text[:1000])
        else:
            self.logger.error(f"Request failed, status code: {response.status}")


# Entry point so the spider can be run directly from an IDE (web refs 1, 3)
if __name__ == "__main__":
    sys.path.append(os.path.dirname(os.path.abspath(__file__)))
    execute(['scrapy', 'crawl', 'customs_gov'])
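
# Alternative entry point, left here only as a commented-out sketch: running the spider
# in-process with scrapy.crawler.CrawlerProcess instead of scrapy.cmdline.execute. This
# does not require the project's scrapy.cfg to be discoverable from the working directory.
# The settings shown (USER_AGENT, ROBOTSTXT_OBEY, DOWNLOAD_DELAY) are illustrative
# assumptions, not values taken from the original project; adjust them to your own needs.
#
# from scrapy.crawler import CrawlerProcess
#
# if __name__ == "__main__":
#     process = CrawlerProcess(settings={
#         'USER_AGENT': ('Mozilla/5.0 (Windows NT 10.0; Win64) AppleWebKit/537.36 '
#                        '(KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'),
#         'ROBOTSTXT_OBEY': False,   # assumption: the default robots.txt check may block the crawl
#         'DOWNLOAD_DELAY': 1.0,     # throttle requests to be polite to the server
#     })
#     process.crawl(CustomSpider)   # schedule this spider class
#     process.start()               # blocks until the crawl finishes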