|
@@ -11,7 +11,9 @@ from selenium.webdriver.support.ui import WebDriverWait
|
|
|
|
|
|
from crossborder.fujian.fujian_parse_excel import parse_excel
|
|
from crossborder.fujian.fujian_parse_excel import parse_excel
|
|
from crossborder.utils.constants import DOWNLOAD_DIR
|
|
from crossborder.utils.constants import DOWNLOAD_DIR
|
|
|
|
+from crossborder.utils.db_helper import DBHelper
|
|
from crossborder.utils.download_utils import configure_stealth_options, generate_month_sequence, download_excel
|
|
from crossborder.utils.download_utils import configure_stealth_options, generate_month_sequence, download_excel
|
|
|
|
+from crossborder.utils.log import log
|
|
from crossborder.utils.parse_utils import traverse_and_process
|
|
from crossborder.utils.parse_utils import traverse_and_process
|
|
|
|
|
|
# 基础配置
|
|
# 基础配置
|
|
@@ -36,10 +38,10 @@ def detect_latest_month(driver):
|
|
WebDriverWait(driver, 10).until(
|
|
WebDriverWait(driver, 10).until(
|
|
EC.presence_of_element_located((By.XPATH, f'//a[contains(@title, "{target_title}")]'))
|
|
EC.presence_of_element_located((By.XPATH, f'//a[contains(@title, "{target_title}")]'))
|
|
)
|
|
)
|
|
- print(f"已找到最新月份数据 {check_year}-{check_month}")
|
|
|
|
|
|
+ log.info(f"已找到最新月份数据 {check_year}-{check_month}")
|
|
return check_year, check_month
|
|
return check_year, check_month
|
|
except:
|
|
except:
|
|
- print(f"未找到 {target_title}")
|
|
|
|
|
|
+ log.info(f"未找到 {target_title}")
|
|
continue
|
|
continue
|
|
raise Exception("三个月内未找到有效数据")
|
|
raise Exception("三个月内未找到有效数据")
|
|
|
|
|
|
@@ -71,15 +73,15 @@ def process_month_data(driver, year, month):
|
|
time.sleep(random.uniform(0.5, 1.5)) # 成功后等待
|
|
time.sleep(random.uniform(0.5, 1.5)) # 成功后等待
|
|
except Exception as e:
|
|
except Exception as e:
|
|
retry += 1
|
|
retry += 1
|
|
- print(f"下载 {title} 失败(第{retry}次重试): {str(e)}")
|
|
|
|
|
|
+ log.error(f"下载 {title} 失败(第{retry}次重试): {str(e)}")
|
|
if retry >= MAX_RETRY:
|
|
if retry >= MAX_RETRY:
|
|
- print(f"❌ 超出最大重试次数,跳过该文件:{title}")
|
|
|
|
|
|
+ log.error(f"❌ 超出最大重试次数,跳过该文件:{title}")
|
|
return 1000
|
|
return 1000
|
|
else:
|
|
else:
|
|
- print(f"🔄 第{retry}次重试:{title}")
|
|
|
|
|
|
+ log.error(f"🔄 第{retry}次重试:{title}")
|
|
time.sleep(random.uniform(2, 4)) # 重试前随机等待
|
|
time.sleep(random.uniform(2, 4)) # 重试前随机等待
|
|
|
|
|
|
- print(f"本页找到{found_count}个有效表格")
|
|
|
|
|
|
+ log.info(f"本页找到{found_count}个有效表格")
|
|
return found_count
|
|
return found_count
|
|
|
|
|
|
|
|
|
|
@@ -89,7 +91,7 @@ def reverse_crawler(driver, target_months):
|
|
# target_months = [(2023, 5), (2023, 4)]
|
|
# target_months = [(2023, 5), (2023, 4)]
|
|
page = 1
|
|
page = 1
|
|
for year, month in target_months:
|
|
for year, month in target_months:
|
|
- print(f"\n开始处理 {year}年{month}月数据".center(50, "="))
|
|
|
|
|
|
+ log.info(f"\n开始处理 {year}年{month}月数据".center(50, "="))
|
|
|
|
|
|
WebDriverWait(driver, 15).until(
|
|
WebDriverWait(driver, 15).until(
|
|
EC.presence_of_element_located((By.CLASS_NAME, "conList_ul"))
|
|
EC.presence_of_element_located((By.CLASS_NAME, "conList_ul"))
|
|
@@ -104,22 +106,22 @@ def reverse_crawler(driver, target_months):
|
|
|
|
|
|
try:
|
|
try:
|
|
# 动态检测当前页面月份
|
|
# 动态检测当前页面月份
|
|
- print(f"当前页面:{driver.current_url}, 第{page}页")
|
|
|
|
|
|
+ log.info(f"当前页面:{driver.current_url}, 第{page}页")
|
|
|
|
|
|
# 处理当前页面的表格数据
|
|
# 处理当前页面的表格数据
|
|
found = process_month_data(driver, year, month)
|
|
found = process_month_data(driver, year, month)
|
|
found_tables += found
|
|
found_tables += found
|
|
|
|
|
|
if found_tables == 1000:
|
|
if found_tables == 1000:
|
|
- print(f"❌{year}年{month}月数据采集失败,跳过当前月")
|
|
|
|
|
|
+ log.error(f"❌{year}年{month}月数据采集失败,跳过当前月")
|
|
break
|
|
break
|
|
|
|
|
|
# 完成四个表格采集
|
|
# 完成四个表格采集
|
|
if found_tables >= 1:
|
|
if found_tables >= 1:
|
|
- print(f"已完成{year}年{month}月全部表格采集")
|
|
|
|
|
|
+ log.info(f"已完成{year}年{month}月全部表格采集")
|
|
processed_months.add((year, month))
|
|
processed_months.add((year, month))
|
|
break
|
|
break
|
|
- print(f"第{page}页已采集表格数:{found_tables}/1,前往下一页采集")
|
|
|
|
|
|
+ log.info(f"第{page}页已采集表格数:{found_tables}/1,前往下一页采集")
|
|
# 分页操作(增强定位稳定性)
|
|
# 分页操作(增强定位稳定性)
|
|
WebDriverWait(driver, 15).until(
|
|
WebDriverWait(driver, 15).until(
|
|
EC.element_to_be_clickable((By.XPATH, '//a[contains(text(),"下一页")]'))
|
|
EC.element_to_be_clickable((By.XPATH, '//a[contains(text(),"下一页")]'))
|
|
@@ -130,10 +132,10 @@ def reverse_crawler(driver, target_months):
|
|
|
|
|
|
|
|
|
|
except TimeoutException:
|
|
except TimeoutException:
|
|
- print(f"未找到更多分页,已采集表格数:{found_tables}/1")
|
|
|
|
|
|
+ log.error(f"未找到更多分页,已采集表格数:{found_tables}/1")
|
|
break
|
|
break
|
|
except Exception as e:
|
|
except Exception as e:
|
|
- print(f"分页异常:{str(e)}")
|
|
|
|
|
|
+ log.error(f"分页异常:{str(e)}")
|
|
handle_retry(driver) # 异常恢复函数
|
|
handle_retry(driver) # 异常恢复函数
|
|
break
|
|
break
|
|
|
|
|
|
@@ -153,9 +155,9 @@ def handle_retry(driver):
|
|
WebDriverWait(driver, 15).until(
|
|
WebDriverWait(driver, 15).until(
|
|
EC.presence_of_element_located((By.CLASS_NAME, "conList_ul"))
|
|
EC.presence_of_element_located((By.CLASS_NAME, "conList_ul"))
|
|
)
|
|
)
|
|
- print("浏览器异常已恢复")
|
|
|
|
|
|
+ log.error("浏览器异常已恢复")
|
|
except:
|
|
except:
|
|
- print("需要人工干预的严重错误")
|
|
|
|
|
|
+ log.error("需要人工干预的严重错误")
|
|
raise
|
|
raise
|
|
|
|
|
|
|
|
|
|
@@ -169,7 +171,7 @@ def main():
|
|
try:
|
|
try:
|
|
# 智能检测最新有效月份
|
|
# 智能检测最新有效月份
|
|
valid_year, valid_month = detect_latest_month(driver)
|
|
valid_year, valid_month = detect_latest_month(driver)
|
|
- print(f"检测到最新有效数据:{valid_year}年{valid_month:02d}月")
|
|
|
|
|
|
+ log.info(f"检测到最新有效数据:{valid_year}年{valid_month:02d}月")
|
|
|
|
|
|
# 生成目标序列
|
|
# 生成目标序列
|
|
if args.year:
|
|
if args.year:
|
|
@@ -185,15 +187,18 @@ def main():
|
|
# 未指定年份时:取最近两个月
|
|
# 未指定年份时:取最近两个月
|
|
target_months = generate_month_sequence(valid_year, valid_month)
|
|
target_months = generate_month_sequence(valid_year, valid_month)
|
|
|
|
|
|
- print(f"目标采集月份序列:{target_months}")
|
|
|
|
|
|
+ log.info(f"目标采集月份序列:{target_months}")
|
|
reverse_crawler(driver, target_months)
|
|
reverse_crawler(driver, target_months)
|
|
- print(f"{len(target_months)}个月份数据已采集完毕")
|
|
|
|
|
|
+ log.info(f"{len(target_months)}个月份数据已采集完毕")
|
|
|
|
|
|
finally:
|
|
finally:
|
|
if 'driver' in locals():
|
|
if 'driver' in locals():
|
|
driver.quit()
|
|
driver.quit()
|
|
- print("\n数据清洗入库中...")
|
|
|
|
|
|
+ log.info("\n数据清洗入库中...")
|
|
traverse_and_process(download_dir, parse_excel, province_name="fujian")
|
|
traverse_and_process(download_dir, parse_excel, province_name="fujian")
|
|
|
|
+ log.info("\n福建省地级市数据同比更新中...")
|
|
|
|
+ db_helper = DBHelper()
|
|
|
|
+ db_helper.update_prov_yoy("福建省")
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
|
|
if __name__ == "__main__":
|