# ScrpyDownload.py
  1. import scrapy
  2. from scrapy.cmdline import execute
  3. import os
  4. import sys
  5. class CustomSpider(scrapy.Spider):
  6. name = 'customs_gov'
  7. allowed_domains = ['www.customs.gov.cn']
  8. # 显式定义入口请求(网页6、网页7)
  9. def start_requests(self):
  10. urls = [
  11. 'http://www.customs.gov.cn/customs/302249/zfxxgk/2799825/302274/302277/4899681/index.html'
  12. ]
  13. for url in urls:
  14. yield scrapy.Request(
  15. url=url,
  16. callback=self.parse,
  17. headers={
  18. 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
  19. 'Referer': 'https://www.customs.gov.cn/'
  20. }
  21. )
  22. def parse(self, response):
  23. # 调试响应状态(网页4)
  24. if response.status == 200:
  25. print("===== 响应内容片段 =====")
  26. print(response.text[:1000])
  27. else:
  28. self.logger.error(f"请求失败,状态码: {response.status}")
  29. # 添加IDE直接运行入口(网页1、网页3)
  30. if __name__ == "__main__":
  31. sys.path.append(os.path.dirname(os.path.abspath(__file__)))
  32. execute(['scrapy', 'crawl', 'customs_gov'])