import os
import sys

import scrapy
from scrapy.cmdline import execute


class CustomSpider(scrapy.Spider):
    name = 'customs_gov'
    allowed_domains = ['www.customs.gov.cn']

    # Explicitly define the entry requests (webpages 6 and 7)
    def start_requests(self):
        urls = [
            'http://www.customs.gov.cn/customs/302249/zfxxgk/2799825/302274/302277/4899681/index.html'
        ]
        for url in urls:
            yield scrapy.Request(
                url=url,
                callback=self.parse,
                headers={
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
                    'Referer': 'https://www.customs.gov.cn/'
                }
            )

    def parse(self, response):
        # Debug the response status (webpage 4).
        # Note: by default Scrapy only delivers 2xx responses to this callback,
        # so the else branch fires only if non-2xx statuses are allowed through
        # (see the sketch after the listing).
        if response.status == 200:
            print("===== Response content snippet =====")
            print(response.text[:1000])
        else:
            self.logger.error(f"Request failed, status code: {response.status}")


# Entry point so the spider can be run directly from an IDE (webpages 1 and 3)
if __name__ == "__main__":
    sys.path.append(os.path.dirname(os.path.abspath(__file__)))
    execute(['scrapy', 'crawl', 'customs_gov'])
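Government portals such as customs.gov.cn often throttle or block default crawler traffic. A minimal hardening sketch using Scrapy's standard per-spider custom_settings hook follows; the specific values are illustrative assumptions, not tuned or tested against this site:

# Optional hardening via per-spider settings (all values below are assumptions).
class CustomSpider(scrapy.Spider):
    name = 'customs_gov'
    allowed_domains = ['www.customs.gov.cn']
    custom_settings = {
        'DOWNLOAD_DELAY': 2,                     # throttle: roughly one request every 2 s
        'RETRY_TIMES': 3,                        # retry transient failures a few times
        'HTTPERROR_ALLOWED_CODES': [403, 404],   # let parse() see these error statuses
    }
    # ... start_requests() and parse() as in the listing above ...

With HTTPERROR_ALLOWED_CODES set, the else branch in parse() can actually observe blocked requests; otherwise Scrapy's HttpError middleware drops non-2xx responses before the callback runs.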