2 Commit-ok 8f1c1fcb06 ... 8ef73bc173

Szerző SHA1 Üzenet Dátum
  01495251 8ef73bc173 Merge branch 'master' of http://42.192.203.166:3000/wyp/crossborder 1 hónapja
  zhangfan a438020283 1.海关总署年度表,月度表只下载最新月份,数据清洗只执行一次 1 hónapja

+ 0 - 1
crossborder/guangdong/selenium_guangdong_download.py

@@ -435,7 +435,6 @@ def main():
 
     finally:
         driver.quit()
-        log.info("\n数据清洗入库中...")
 
 
 if __name__ == "__main__":

+ 8 - 0
crossborder/quanguo/parse_month_excel.py

@@ -9,6 +9,8 @@ from crossborder.utils.parse_utils import convert_unit, parse_ratio
 
 CURRENT_YEAR = str(datetime.now().year)
 
+_has_executed = False
+
 def is_current_year_data(date_str):
     """
     判断是否为当前年份的数据(如 2025.01)
@@ -69,7 +71,13 @@ def parse_month_table_excel(file_path):
     解析月度汇总表并入库
     :param file_path: Excel 文件路径
     """
+    global _has_executed
+    if _has_executed:
+        return
+    _has_executed = True
+
     log.info(f"月度表数据解析:{file_path}")
+
     db = DBHelper()
     sql_template = """
     INSERT INTO t_yujin_crossborder_monthly_summary 

+ 5 - 4
crossborder/quanguo/parse_year_excel.py

@@ -1,6 +1,7 @@
 from datetime import datetime
 import xlrd
 from crossborder.utils.db_helper import DBHelper
+from crossborder.utils.log import log
 from crossborder.utils.parse_utils import convert_unit, parse_ratio
 
 _parse_executed = False  # 模块级变量,控制执行次数
@@ -29,7 +30,7 @@ def get_upsert_sql():
 def parse_year_table_excel(file):
     global _parse_executed
     if _parse_executed:
-        print("⚠️ parse_year_table_excel 已执行过,不再重复执行")
+        log.info("⚠️ parse_year_table_excel 已执行过,不再重复执行")
         return
 
     db_helper = DBHelper()
@@ -40,7 +41,7 @@ def parse_year_table_excel(file):
         workbook = xlrd.open_workbook(file)
         sheet = workbook.sheet_by_index(0)
     except Exception as e:
-        print(f"文件读取失败: {e}")
+        log.error(f"文件读取失败: {e}")
         return
 
     sql = get_upsert_sql()
@@ -68,10 +69,10 @@ def parse_year_table_excel(file):
     # 使用 DBHelper 执行 SQL 插入
     try:
         affected_rows = db_helper.execute_sql_with_params(sql, params_list)
-        print(f"成功处理 {len(params_list)} 条数据,受影响行数:{affected_rows}")
+        log.info(f"成功处理 {len(params_list)} 条数据,受影响行数:{affected_rows}")
         _parse_executed = True
     except Exception as e:
-        print(f"数据库操作失败: {e}")
+        log.error(f"数据库操作失败: {e}")
         raise
 
 

+ 18 - 11
crossborder/quanguo/selenium_download.py

@@ -18,7 +18,7 @@ from crossborder.utils.log import log
 
 base_url = "http://www.customs.gov.cn/customs/302249/zfxxgk/2799825/302274/302277/6348926/index.html"
 download_dir = DOWNLOAD_DIR / "total"
-
+downloaded_tables = set()  # 已下载的表格名集合
 
 
 
@@ -113,14 +113,13 @@ def go_to_year_page(driver, year):
         return False
 
 
-def crawl_with_selenium(driver, base_url, year, latest_only=False):
+def crawl_with_selenium(driver, year, latest_only=False):
     """主抓取函数"""
-    driver.get(base_url)
-
-    if not go_to_year_page(driver, year):
-        log.warning(f"{year} 页面不可用,跳过")
-        return
-    log.info(f"开始抓取 {year} 年数据:{driver.current_url}")
+    if year < datetime.now().year:
+        if not go_to_year_page(driver, year):
+            log.warning(f"{year} 页面不可用,跳过")
+            return
+    log.info(f"开始抓取 {year} 年数据,当前标题: {driver.title}")
     try:
         while True:
             table = WebDriverWait(driver, 20).until(
@@ -146,7 +145,7 @@ def crawl_with_selenium(driver, base_url, year, latest_only=False):
             time.sleep(random.uniform(1, 3))
 
     except StaleElementReferenceException:
-        log.info("检测到元素失效,自动刷新表格")
+        log.error("检测到元素失效,自动刷新表格")
         driver.refresh()
         WebDriverWait(driver, 30).until(
             EC.presence_of_element_located((By.CSS_SELECTOR, f"#yb{year}RMB"))
@@ -158,9 +157,14 @@ def sanitize_filename(filename):
 
 
 def handle_month_data(driver, table_name, month_links, year, latest_only):
+    global downloaded_tables
     main_window = driver.current_window_handle
     for idx, month_data in enumerate(month_links):
         if 1 <= month_data[0] <= 12:
+            # 年度表月度表只下载一次(最新月份数据)
+            if "进出口商品总值表" in table_name and table_name in downloaded_tables:
+                log.info(f"【{table_name}】已下载过,跳过")
+                continue
             # 新标签页策略(防止主页面DOM变更)
             driver.switch_to.window(main_window)
             driver.execute_script(f"window.open('{month_data[1]}', '_blank_{idx}')")
@@ -169,6 +173,8 @@ def handle_month_data(driver, table_name, month_links, year, latest_only):
             month_num, link = month_data
             try:
                download_excel(driver, link, year, month_num, table_name, download_dir)
+               # 下载成功后将表格名加入集合
+               downloaded_tables.add(table_name)
             except Exception as e:
                log.info(f"【异常】下载失败: {str(e)}")
             time.sleep(random.uniform(0.5, 1.5))  # 下载间隔
@@ -190,11 +196,12 @@ if __name__ == "__main__":
     options = configure_stealth_options(download_dir)
     driver = webdriver.Firefox(options=options)
 
+    base_url = "http://www.customs.gov.cn/customs/302249/zfxxgk/2799825/302274/302277/6348926/index.html"
+    driver.get(base_url)
     try:
         for year in years_to_crawl:
-            base_url = "http://www.customs.gov.cn/customs/302249/zfxxgk/2799825/302274/302277/6348926/index.html"
             log.info(f"\n【{year}年】开始抓取...".center(66, "-"))
-            crawl_with_selenium(driver, base_url, year=year, latest_only=args.year is None)
+            crawl_with_selenium(driver, year=year, latest_only=args.year is None)
     finally:
         driver.quit()
         log.info("【海关总署】全年数据抓取结束".center(66, "*"))