|
@@ -85,6 +85,62 @@ def crawl_by_year_tabs(driver, base_url, year_month):
|
|
|
driver.close()
|
|
|
driver.switch_to.window(driver.window_handles[0])
|
|
|
|
|
|
+
|
|
|
def get_current_and_previous_month(text):
    """Parse a "<year>年<Chinese month>月" label and derive the preceding month.

    Accepts strings such as "2025年-五月" or "2025年十二月". A single "-" or
    whitespace character may separate the year from the month.

    :param text: input text, e.g. "2025年-五月"
    :return: tuple(current_year, current_month, previous_year, previous_month),
        e.g. ("2025年", "五月", "2025年", "四月"). January rolls back to
        December of the previous year.
    :raises ValueError: if no valid year/month pair can be found in ``text``
    """
    # Chinese month numerals in calendar order (index 0 is January).
    chinese_months = (
        '一', '二', '三', '四', '五', '六',
        '七', '八', '九', '十', '十一', '十二',
    )

    # Only the twelve valid numerals are accepted: "十" optionally followed by
    # 一/二, or a single numeral 一..九. The alternation must be directly
    # followed by "月", so malformed input such as "一二月" or "十|月" is
    # rejected by the pattern itself.
    match = re.search(
        r'(\d{4})年[-\s]?(十[一二]?|[一二三四五六七八九])月', text
    )
    if not match:
        raise ValueError(f"无法从 '{text}' 提取有效的年份和月份")

    current_year_num = int(match.group(1))
    chinese_month_str = match.group(2)
    # The regex guarantees the numeral is one of the twelve known values.
    current_month_num = chinese_months.index(chinese_month_str) + 1

    # Step back one month, wrapping January to December of the previous year.
    if current_month_num > 1:
        previous_year_num = current_year_num
        previous_month_num = current_month_num - 1
    else:
        previous_year_num = current_year_num - 1
        previous_month_num = 12

    previous_month_chinese = chinese_months[previous_month_num - 1]

    return (
        f"{current_year_num}年",
        chinese_month_str + '月',
        f"{previous_year_num}年",
        previous_month_chinese + '月',
    )
|
|
|
+
|
|
|
def process_month_tabs(driver, year, base_url, year_month):
|
|
|
"""处理月份Tab导航(动态获取真实存在的月份)"""
|
|
|
# 显式等待容器加载
|
|
@@ -97,6 +153,8 @@ def process_month_tabs(driver, year, base_url, year_month):
|
|
|
processed_months = set() # 已处理月份记录
|
|
|
retry_count = 0
|
|
|
|
|
|
+ y1, m1, y2, m2 = get_current_and_previous_month(year_month)
|
|
|
+
|
|
|
while retry_count < 3:
|
|
|
try:
|
|
|
# 全量获取所有月份Tab
|
|
@@ -120,11 +178,10 @@ def process_month_tabs(driver, year, base_url, year_month):
|
|
|
|
|
|
log.info(f"点击月份Tab:{year}-{month_text}")
|
|
|
if year_month is not None:
|
|
|
- tar_year, tar_month = year_month.split('-')[0], year_month.split('-')[1]
|
|
|
- if tar_year != year:
|
|
|
+ if y1 != year or y2 != year:
|
|
|
retry_count += 1
|
|
|
break
|
|
|
- if tar_month != month_text:
|
|
|
+ if not (y1 == year and m1 == month_text) and not (y2 == year and m2 == month_text):
|
|
|
log.info(f"{year}年 {month_text} 月份跳过, auto tar: {year_month}")
|
|
|
continue
|
|
|
a_tag.click()
|