import os
import sys

import scrapy
from scrapy.cmdline import execute


class CustomSpider(scrapy.Spider):
    name = 'customs_gov'
    allowed_domains = ['www.customs.gov.cn']

    # Explicitly define the entry requests (webpages 6 and 7)
    def start_requests(self):
        urls = [
            'http://www.customs.gov.cn/customs/302249/zfxxgk/2799825/302274/302277/4899681/index.html'
        ]
        for url in urls:
            yield scrapy.Request(
                url=url,
                callback=self.parse,
                headers={
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
                    'Referer': 'https://www.customs.gov.cn/'
                }
            )

    def parse(self, response):
        # Debug the response status (webpage 4).
        # Note: by default Scrapy only delivers 2xx responses to this callback,
        # so the else branch fires only if non-2xx statuses are allowed through
        # (see the sketch after the listing).
        if response.status == 200:
            print("===== Response content snippet =====")
            print(response.text[:1000])
        else:
            self.logger.error(f"Request failed, status code: {response.status}")


# Entry point so the spider can be run directly from an IDE (webpages 1 and 3)
if __name__ == "__main__":
    sys.path.append(os.path.dirname(os.path.abspath(__file__)))
    execute(['scrapy', 'crawl', 'customs_gov'])
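Government portals such as customs.gov.cn often throttle or block default crawler traffic. A minimal hardening sketch using Scrapy's standard per-spider custom_settings hook follows; the specific values are illustrative assumptions, not tuned or tested against this site:

# Optional hardening via per-spider settings (all values below are assumptions).
class CustomSpider(scrapy.Spider):
    name = 'customs_gov'
    allowed_domains = ['www.customs.gov.cn']
    custom_settings = {
        'DOWNLOAD_DELAY': 2,                     # throttle: roughly one request every 2 s
        'RETRY_TIMES': 3,                        # retry transient failures a few times
        'HTTPERROR_ALLOWED_CODES': [403, 404],   # let parse() see these error statuses
    }
    # ... start_requests() and parse() as in the listing above ...

With HTTPERROR_ALLOWED_CODES set, the else branch in parse() can actually observe blocked requests; otherwise Scrapy's HttpError middleware drops non-2xx responses before the callback runs.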