|
@@ -2,6 +2,8 @@ import os
|
|
|
import random
|
|
|
import re
|
|
|
import time
|
|
|
+import sys
|
|
|
+from datetime import datetime, timedelta
|
|
|
from pathlib import Path
|
|
|
from urllib.parse import urljoin
|
|
|
|
|
@@ -51,7 +53,7 @@ def configure_stealth_options():
|
|
|
opts.add_argument("--headless")
|
|
|
return opts
|
|
|
|
|
|
-def crawl_by_year_tabs(driver, base_url):
|
|
|
+def crawl_by_year_tabs(driver, base_url, year_month):
|
|
|
"""按年份Tab导航爬取数据"""
|
|
|
years = ['2023年', '2024年', '2025年']
|
|
|
WebDriverWait(driver, 30).until(
|
|
@@ -74,15 +76,15 @@ def crawl_by_year_tabs(driver, base_url):
|
|
|
driver.switch_to.window(driver.window_handles[-1])
|
|
|
log.info(f"\n正在处理 {year_text} 年份页面")
|
|
|
|
|
|
- process_month_tabs(driver, year_text, base_url)
|
|
|
+ process_month_tabs(driver, year_text, base_url, year_month)
|
|
|
|
|
|
# 返回主窗口
|
|
|
driver.close()
|
|
|
driver.switch_to.window(driver.window_handles[0])
|
|
|
|
|
|
-def process_month_tabs(driver, year, base_url):
|
|
|
+def process_month_tabs(driver, year, base_url, year_month):
|
|
|
"""处理月份Tab导航(动态获取真实存在的月份)"""
|
|
|
- # ✅ 显式等待容器加载
|
|
|
+ # 显式等待容器加载
|
|
|
WebDriverWait(driver, 30).until(
|
|
|
EC.presence_of_element_located((By.CLASS_NAME, "portlet"))
|
|
|
)
|
|
@@ -92,8 +94,7 @@ def process_month_tabs(driver, year, base_url):
|
|
|
processed_months = set() # 已处理月份记录
|
|
|
retry_count = 0
|
|
|
|
|
|
- # while retry_count < 3: # 最多重试3次
|
|
|
- while True: # 最多重试3次
|
|
|
+ while retry_count < 3:
|
|
|
try:
|
|
|
# 全量获取所有月份Tab
|
|
|
month_items = driver.find_elements(By.XPATH, '//ul[@class="nav_tab"]//li')
|
|
@@ -115,6 +116,14 @@ def process_month_tabs(driver, year, base_url):
|
|
|
continue # 跳过已处理月份
|
|
|
|
|
|
log.info(f"点击月份Tab:{year}-{month_text}")
|
|
|
+ if year_month is not None:
|
|
|
+ tar_year, tar_month = year_month.split('-')[0], year_month.split('-')[1]
|
|
|
+ if tar_year != year:
|
|
|
+ retry_count += 1
|
|
|
+ break
|
|
|
+ if tar_month != month_text:
|
|
|
+ log.info(f"{year}年 {month_text} 月份跳过, increment tar: {year_month}")
|
|
|
+ continue
|
|
|
a_tag.click()
|
|
|
|
|
|
# 处理详情页逻辑
|
|
@@ -145,13 +154,10 @@ def process_month_tabs(driver, year, base_url):
|
|
|
break
|
|
|
else:
|
|
|
# 部分月份未找到,重新获取元素
|
|
|
- # retry_count += 1
|
|
|
log.info(f"第 {retry_count} 次重试获取月份Tab...")
|
|
|
- time.sleep(2)
|
|
|
|
|
|
except StaleElementReferenceException:
|
|
|
log.info("页面刷新,重新获取月份Tab列表...")
|
|
|
- # retry_count += 1
|
|
|
time.sleep(2)
|
|
|
|
|
|
log.info(f"{year}年最终处理的月份:{processed_months}")
|
|
@@ -234,10 +240,60 @@ def extract_year_and_month(file_name):
|
|
|
else:
|
|
|
raise ValueError(f"无法从文件名中提取年份和月份:{file_name}")
|
|
|
|
|
|
-def crawl_with_selenium(url):
|
|
|
+
|
|
|
def convert_to_chinese_uppercase(num):
    """Return the Chinese numeral used for month number ``num``.

    Despite the function name, the result is the everyday numeral form
    (一, 二, ... 十二), which is what the callers concatenate with "月".

    Args:
        num: Month number, expected to be an integer from 1 to 12.

    Returns:
        The numeral string for ``num`` (for example ``'三'`` or ``'十一'``),
        or ``None`` when ``num`` is not one of the twelve month numbers.
    """
    # A lookup table replaces the if/elif chain: every valid month is listed
    # once, and anything else (out of range or not a whole month number)
    # falls through to None instead of failing while indexing a string.
    month_numerals = {
        1: '一',
        2: '二',
        3: '三',
        4: '四',
        5: '五',
        6: '六',
        7: '七',
        8: '八',
        9: '九',
        10: '十',
        11: '十一',
        12: '十二',
    }
    return month_numerals.get(num)
|
|
|
+
|
|
|
def detect_latest_month(driver, url):
    """Find the newest month tab on the page whose data is not stored yet.

    Loads ``url`` and then checks the current calendar month and the two
    months before it, newest first. For each month it waits up to 10 seconds
    for a link inside ``ul.nav_tab`` whose text is "<Chinese numeral>月".
    When the tab is present, ``base_mysql.get_code_exist`` is called with
    ``'YYYY-MM'`` and the fixed argument ``'330000'``; a positive count means
    the month is already stored and the next older month is tried.

    Args:
        driver: Selenium WebDriver used to load and inspect the page.
        url: Address of the page that carries the month tabs.

    Returns:
        A string such as ``"2025年-五月"`` for the newest month that has a tab
        and no stored data, or ``None`` when none of the three months
        qualifies.

    Raises:
        Whatever ``base_mysql.get_code_exist`` raises. A failing storage
        lookup is no longer reported as "tab not found".
    """
    driver.get(url)
    today = datetime.now()

    # Work in whole calendar months so that each of the last three months is
    # visited exactly once. Stepping back in multiples of 30 days can repeat
    # a month or skip one (run on March 31 it yields March, March, January).
    newest_month_index = today.year * 12 + (today.month - 1)

    for offset in range(3):
        check_year, zero_based_month = divmod(newest_month_index - offset, 12)
        month = zero_based_month + 1
        check_month = convert_to_chinese_uppercase(month)

        target_title = f"{check_month}月"
        # NOTE(review): the lookup matches on the month text only, so it
        # presumably relies on the loaded page showing the intended year;
        # confirm against the site layout.
        tab_locator = (
            By.XPATH,
            f'//ul[@class="nav_tab"]//li/a[normalize-space()="{target_title}"]',
        )
        try:
            # Only the wait is guarded: it raises when the tab does not show
            # up in time, and any driver error here is treated the same way.
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located(tab_locator)
            )
        except Exception:
            log.info(f"未找到 {target_title}")
            continue

        log.info(f"已找到最新月份数据 {check_year}-{check_month}")
        # Skip the month when it has already been written to the table.
        count = base_mysql.get_code_exist(f'{check_year}-{month:02d}', '330000')
        if count > 0:
            log.info(f"count: {count} -> 已存在 {check_year}-{check_month} 数据,跳过")
            continue
        return f"{check_year}年-{check_month}月"

    log.info("三个月内未找到有效数据")
    return None
|
|
|
+
|
|
|
+def crawl_with_selenium(url, mark):
|
|
|
driver = webdriver.Firefox(options=configure_stealth_options())
|
|
|
- base_url = 'http://hangzhou.customs.gov.cn'
|
|
|
|
|
|
+ year_month = None
|
|
|
+ if 'increment' == mark:
|
|
|
+ res = detect_latest_month(driver, url)
|
|
|
+ if res is None:
|
|
|
+ log.info("浙江省海关没有最新数据更新")
|
|
|
+ sys.exit(0)
|
|
|
+ year_month = res
|
|
|
+ print(f"检测到最新有效数据:{year_month}")
|
|
|
+
|
|
|
+ base_url = 'http://hangzhou.customs.gov.cn'
|
|
|
try:
|
|
|
# 注入反检测脚本
|
|
|
driver.execute_script("""
|
|
@@ -251,7 +307,7 @@ def crawl_with_selenium(url):
|
|
|
driver.get(url)
|
|
|
|
|
|
# 按年份导航
|
|
|
- crawl_by_year_tabs(driver, base_url)
|
|
|
+ crawl_by_year_tabs(driver, base_url, year_month)
|
|
|
|
|
|
finally:
|
|
|
driver.quit()
|
|
@@ -316,7 +372,8 @@ def hierarchical_traversal(root_path):
|
|
|
gov_commodity_zhejiang_city.process_folder(md['path'])
|
|
|
|
|
|
if __name__ == "__main__":
|
|
|
- crawl_with_selenium('http://hangzhou.customs.gov.cn/hangzhou_customs/575609/zlbd/575612/575612/6430241/6430315/index.html')
|
|
|
+ # crawl_with_selenium('http://hangzhou.customs.gov.cn/hangzhou_customs/575609/zlbd/575612/575612/6430241/6430315/index.html', 'all')
|
|
|
+ crawl_with_selenium('http://hangzhou.customs.gov.cn/hangzhou_customs/575609/zlbd/575612/575612/6430241/6430315/index.html', 'increment')
|
|
|
log.info(f"浙江杭州海关全量数据下载任务完成")
|
|
|
# 等待5s后执行
|
|
|
time.sleep(5)
|