hace 4 meses · 9d9a21a6f9
--- a/crossborder/zhejiang/crawl_gov_zhejiang_full.py
+++ b/crossborder/zhejiang/crawl_gov_zhejiang_full.py
@@ -284,6 +284,51 @@ def detect_latest_month(driver, url):
 
				     log.error("三个月内未找到有效数据")
			
 
				     return None
			
 
				 
			
 
				+def extract_year_month_chinese(text):
			
 
				+    """
			
 
				+    支持格式：
			
 
				+    - 2025年四月
			
 
				+    - 2025年-四月
			
 
				+    - 2024年十二月
			
 
				+    - 2023年-二月
			
 
				+    """
			
 
				+    # 中文月份映射表
			
 
				+    month_map = {
			
 
				+        '一': '01',
			
 
				+        '二': '02',
			
 
				+        '三': '03',
			
 
				+        '四': '04',
			
 
				+        '五': '05',
			
 
				+        '六': '06',
			
 
				+        '七': '07',
			
 
				+        '八': '08',
			
 
				+        '九': '09',
			
 
				+        '十': '10',
			
 
				+        '十一': '11',
			
 
				+        '十二': '12'
			
 
				+    }
			
 
				+
			
 
				+    # 正则匹配年份和中文月份
			
 
				+    match = re.search(r"(\d{4})年-?([一|二|三|四|五|六|七|八|九|十]{1,2}[一|二]?月)", text)
			
 
				+
			
 
				+    if not match:
			
 
				+        raise ValueError(f"无法从文本中提取年份和月份: {text}")
			
 
				+
			
 
				+    year = match.group(1)  # 提取年份
			
 
				+
			
 
				+    # 提取中文月份并处理成数字
			
 
				+    chinese_month = match.group(2).replace('月', '')
			
 
				+    if chinese_month in month_map:
			
 
				+        month = month_map[chinese_month]
			
 
				+    else:
			
 
				+        # 特殊处理 "十月"
			
 
				+        if chinese_month == '十月':
			
 
				+            month = '10'
			
 
				+        else:
			
 
				+            raise ValueError(f"不支持的中文月份格式: {chinese_month}")
			
 
				+
			
 
				+    return year, month
			
 
				+
			
 
				 def crawl_with_selenium(url, mark):
			
 
				     driver = webdriver.Firefox(options=configure_stealth_options())
			
 
				 
			
@@ -295,6 +340,7 @@ def crawl_with_selenium(url, mark):
 
				             return None
			
 
				         year_month = res
			
 
				         print(f"检测到最新有效数据：{year_month}")
			
 
				+        hierarchical_traversal(download_dir, year_month)
			
 
				 
			
 
				     base_url = 'http://hangzhou.customs.gov.cn'
			
 
				     try:
			
@@ -381,7 +427,7 @@ def hierarchical_traversal(root_path, year_month):
 
				                 log.info(f"  月份：{md['month']:02d} | 路径：{md['path']}")
			
 
				                 path = md['path']
			
 
				                 if year_month is not None:
			
 
				-                    year, month = extract_year_month(year_month)
			
 
				+                    year, month = extract_year_month_chinese(year_month)
			
 
				                     parts = path.parts
			
 
				                     if year_dir.name != year or parts[-1] != month:
			
 
				                         log.info(f"浙江省海关已处理 {year_month} 数据，返回")