Ver código fonte

crawl zhejiang fix

zhangfan 1 mês atrás
pai
commit
cd1bbf0b8d
1 arquivos alterados com 60 adições e 3 exclusões
  1. 60 3
      crossborder/zhejiang/crawl_gov_zhejiang_full.py

+ 60 - 3
crossborder/zhejiang/crawl_gov_zhejiang_full.py

@@ -85,6 +85,62 @@ def crawl_by_year_tabs(driver, base_url, year_month):
         driver.close()
         driver.switch_to.window(driver.window_handles[0])
 
+
def get_current_and_previous_month(text):
    """
    Parse a label such as "2025年-五月" or "2025年十二月" and derive the
    month that precedes it.

    The label must contain a four-digit year followed by "年", at most one
    "-" or whitespace separator, a Chinese month numeral (一 .. 十二) and "月".
    January rolls back to December of the previous year.

    :param text: input label, e.g. "2025年-五月"
    :return: tuple(current_year, current_month, previous_year, previous_month),
             each formatted like the input pieces, e.g.
             ("2025年", "五月", "2025年", "四月")
    :raises ValueError: if no valid year/month pair can be found in ``text``
    """

    # Chinese month numerals in calendar order: position + 1 is the month number.
    chinese_months = (
        '一', '二', '三', '四', '五', '六',
        '七', '八', '九', '十', '十一', '十二',
    )

    # Only real month numerals are accepted. "十" must be tried together with
    # its optional 一/二 suffix first, so that 十一 / 十二 are captured whole.
    match = re.search(r'(\d{4})年[-\s]?(十[一二]?|[一二三四五六七八九])月', text)
    if not match:
        raise ValueError(f"无法从 '{text}' 提取有效的年份和月份")

    current_year_num = int(match.group(1))
    chinese_month_str = match.group(2)
    # The pattern guarantees the numeral is one of the twelve table entries.
    current_month_num = chinese_months.index(chinese_month_str) + 1

    # Step back one month, wrapping January to December of the previous year.
    if current_month_num > 1:
        previous_year_num = current_year_num
        previous_month_num = current_month_num - 1
    else:
        previous_year_num = current_year_num - 1
        previous_month_num = 12

    previous_month_chinese = chinese_months[previous_month_num - 1]

    return (
        f"{current_year_num}年",
        chinese_month_str + '月',
        f"{previous_year_num}年",
        previous_month_chinese + '月',
    )
+
 def process_month_tabs(driver, year, base_url, year_month):
     """处理月份Tab导航(动态获取真实存在的月份)"""
     # 显式等待容器加载
@@ -97,6 +153,8 @@ def process_month_tabs(driver, year, base_url, year_month):
     processed_months = set()  # 已处理月份记录
     retry_count = 0
 
+    y1, m1, y2, m2 = get_current_and_previous_month(year_month)
+
     while retry_count < 3:
         try:
             # 全量获取所有月份Tab
@@ -120,11 +178,10 @@ def process_month_tabs(driver, year, base_url, year_month):
 
                 log.info(f"点击月份Tab:{year}-{month_text}")
                 if year_month is not None:
-                    tar_year, tar_month = year_month.split('-')[0], year_month.split('-')[1]
-                    if tar_year != year:
+                    if y1 != year or y2 != year:
                         retry_count += 1
                         break
-                    if tar_month != month_text:
+                    if not (y1 == year and m1 == month_text) and not (y2 == year and m2 == month_text):
                         log.info(f"{year}年 {month_text} 月份跳过, auto tar: {year_month}")
                         continue
                 a_tag.click()