# selenium_shandong_download.py

import argparse
import random
import time
from datetime import datetime, timedelta

from selenium import webdriver
from selenium.common import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from shandong.shandong_parse_excel import parse_excel
from utils.constants import DOWNLOAD_DIR
from utils.db_helper import DBHelper
from utils.download_utils import configure_stealth_options, generate_month_sequence, download_excel
from utils.log import log
from utils.parse_utils import traverse_and_process

  16. # 基础配置
  17. MAX_RETRY = 3
  18. BASE_URL = "http://qingdao.customs.gov.cn/qingdao_customs/406535/fdzdgknr30/406514/406515/index.html"
  19. download_dir = DOWNLOAD_DIR / "shandong"
  20. def detect_latest_month(driver):
  21. """三级回溯智能检测最新有效月份"""
  22. driver.get(BASE_URL)
  23. current_date = datetime.now()
  24. for offset in range(0, 3):
  25. check_date = current_date - timedelta(days=offset * 30)
  26. check_year = check_date.year
  27. check_month = check_date.month
  28. target_title = f"{check_year}年{check_month}月山东省进出口主要国别(地区)总值"
  29. try:
  30. WebDriverWait(driver, 10).until(
  31. EC.presence_of_element_located((By.XPATH, f'//a[contains(@title, "{target_title}")]'))
  32. )
  33. log.info(f"已找到最新月份数据 {check_year}-{check_month}")
  34. return check_year, check_month
  35. except:
  36. log.error(f"未找到 {target_title}")
  37. continue
  38. raise Exception("三个月内未找到有效数据")
  39. def process_month_data(driver, year, month):
  40. required_titles = [
  41. f"{year}年{month}月山东省进口20位主要商品总值",
  42. f"{year}年{month}月山东省出口20位主要商品总值",
  43. f"{year}年{month}月山东省各地市进出口总值",
  44. f"{year}年{month}月山东省进出口主要国别(地区)总值"
  45. ]
  46. found_count = 0
  47. links = driver.find_elements(By.XPATH, '//a[contains(@title,"山东省")]')
  48. for link in links:
  49. title = link.get_attribute("title")
  50. if title in required_titles:
  51. retry = 0
  52. success = False
  53. while retry < MAX_RETRY and not success:
  54. try:
  55. url = link.get_attribute("href")
  56. download_excel(driver, url, year, month, title, download_dir)
  57. found_count += 1
  58. time.sleep(random.uniform(0.5, 1.5)) # 下载间隔
  59. success = True # 成功则跳出循环
  60. except Exception as e:
  61. retry += 1
  62. log.error(f"下载 {title} 失败(第{retry}次重试): {e}")
  63. if retry < MAX_RETRY:
  64. time.sleep(random.uniform(2, 5)) # 随机等待后再试
  65. else:
  66. log.error(f"{title} 下载已达到最大重试次数,跳过该文件。")
  67. log.info(f"本页处理完成,找到{found_count}个有效表格")
  68. return found_count
  69. def reverse_crawler(driver, target_months):
  70. """逆向分页抓取核心(优化分页逻辑)"""
  71. processed_months = set()
  72. # target_months = [(2023, 5), (2023, 4)]
  73. page = 1
  74. for year, month in target_months:
  75. log.info(f"开始处理 {year}年{month}月数据".center(55, "="))
  76. WebDriverWait(driver, 15).until(
  77. EC.presence_of_element_located((By.CLASS_NAME, "conList_ul"))
  78. )
  79. current_page = 1
  80. found_tables = 0
  81. while True:
  82. # 智能等待页面稳定
  83. random_sleep(base=2, variance=3)
  84. try:
  85. # 动态检测当前页面月份
  86. # page_year, page_month = extract_page_date(driver)
  87. log.info(f"当前页面:{driver.current_url}, 第{page}页")
  88. #
  89. # # 月份不匹配时中断循环
  90. # if (page_year, page_month) != (year, month):
  91. # break
  92. # 处理当前页面的表格数据
  93. found = process_month_data(driver, year, month)
  94. found_tables += found
  95. # 完成四个表格采集
  96. if found_tables >= 4:
  97. log.info(f"已完成{year}年{month}月全部表格采集")
  98. processed_months.add((year, month))
  99. break
  100. log.info(f"第{page}页已采集表格数:{found_tables}/4,前往下一页采集")
  101. # 分页操作(增强定位稳定性)
  102. WebDriverWait(driver, 15).until(
  103. EC.element_to_be_clickable((By.XPATH, '//a[contains(text(),"下一页")]'))
  104. ).click()
  105. current_page += 1
  106. page += 1
  107. except TimeoutException:
  108. log.error(f"未找到更多分页,已采集表格数:{found_tables}/4")
  109. break
  110. except Exception as e:
  111. log.error(f"分页异常:{str(e)}")
  112. handle_retry(driver) # 异常恢复函数
  113. break
  114. return processed_months
  115. def random_sleep(base=2, variance=5):
  116. """智能随机等待"""
  117. sleep_time = base + random.random() * variance
  118. time.sleep(sleep_time)
  119. def handle_retry(driver):
  120. """异常恢复处理"""
  121. try:
  122. driver.refresh()
  123. WebDriverWait(driver, 15).until(
  124. EC.presence_of_element_located((By.CLASS_NAME, "conList_ul"))
  125. )
  126. log.warning("浏览器异常已恢复")
  127. except:
  128. log.error("需要人工干预的严重错误")
  129. raise
  130. def main():
  131. """主入口(优化参数处理逻辑)"""
  132. parser = argparse.ArgumentParser(description='海关数据智能抓取系统')
  133. parser.add_argument('--year', type=int, default=None,
  134. help='终止年份(如2023),未指定时抓取最新两个月')
  135. args = parser.parse_args()
  136. log.info("【山东海关】数据抓取开始".center(66, "*"))
  137. driver = webdriver.Firefox(options=configure_stealth_options(download_dir))
  138. try:
  139. # 智能检测最新有效月份
  140. valid_year, valid_month = detect_latest_month(driver)
  141. log.info(f"【山东海关】最新数据:{valid_year}年{valid_month:02d}月")
  142. # 生成目标序列
  143. if args.year:
  144. # 指定年份时:从最新月到目标年1月
  145. target_months = generate_month_sequence(
  146. start_year=valid_year,
  147. start_month=valid_month,
  148. end_year=args.year
  149. )
  150. else:
  151. # 未指定年份时:取最近两个月
  152. target_months = generate_month_sequence(valid_year, valid_month)
  153. log.info(f"【山东海关】目标采集月份序列:{target_months}")
  154. reverse_crawler(driver, target_months)
  155. log.info(f"{len(target_months)}个月份数据已采集完毕")
  156. finally:
  157. driver.quit()
  158. log.info("【山东海关】数据抓取结束".center(66, "*"))
  159. log.info("\n山东省数据清洗入库中...")
  160. traverse_and_process(download_dir, parse_excel, province_name="shandong")
  161. log.info("\n山东省地级市数据同比更新中...")
  162. db_helper = DBHelper()
  163. db_helper.update_prov_yoy("山东省")
  164. if __name__ == "__main__":
  165. main()