Ver Fonte

crawl remove sql file

zhangfan há 1 mês atrás
pai
commit
a2ec9ab246

+ 14 - 6
crossborder/anhui/crawl_gov_anhui_full.py

@@ -17,8 +17,9 @@ from crossborder.anhui import gov_commodity_anhui_city, download_dir
 from crossborder.anhui import gov_commodity_anhui_country
 from crossborder.anhui import gov_commodity_anhui_import_export
 from crossborder.utils import base_country_code, base_mysql
+from crossborder.utils.base_country_code import extract_year_month
 from crossborder.utils.dingtalk import send_dingtalk_message
-from crossborder.utils.log import get_logger
+from crossborder.utils.log import  get_logger
 
 log = get_logger(__name__)
 
@@ -254,7 +255,7 @@ def crawl_with_selenium(url, mark):
         print(f"安徽省合肥海关全量数据下载任务完成")
         # 等待5s后执行
         time.sleep(5)
-        hierarchical_traversal(download_dir)
+        hierarchical_traversal(download_dir, year_month)
         print("安徽省海关类章、国家、城市所有文件处理完成!")
         time.sleep(5)
         base_mysql.update_shandong_yoy('安徽省')
@@ -291,7 +292,7 @@ def wait_for_download_complete(timeout=30, existing_files=None):
         time.sleep(2)
     raise TimeoutError("文件下载超时")
 
-def hierarchical_traversal(root_path):
+def hierarchical_traversal(root_path, year_month):
     """分层遍历:省份->年份->月目录"""
     root = Path(root_path)
     # 获取所有年份目录
@@ -317,9 +318,16 @@ def hierarchical_traversal(root_path):
         if month_dirs:
             for md in sorted(month_dirs, key=lambda x: x["month"], reverse=True):
                 print(f"  月份:{md['month']:02d} | 路径:{md['path']}")
-                gov_commodity_anhui_import_export.process_folder(md['path'])
-                gov_commodity_anhui_country.process_folder(md['path'])
-                gov_commodity_anhui_city.process_folder(md['path'])
+                path = md['path']
+                if year_month is not None:
+                    year, month = extract_year_month(year_month)
+                    parts = path.parts
+                    if year_dir.name != year or parts[-1] != month:
+                        log.info(f"安徽省海关已处理 {year_month} 数据,返回")
+                        return
+                gov_commodity_anhui_import_export.process_folder(path)
+                gov_commodity_anhui_country.process_folder(path)
+                gov_commodity_anhui_city.process_folder(path)
 
 def main():
     try:

+ 14 - 9
crossborder/hebei/crawl_gov_hebei_full.py

@@ -18,7 +18,7 @@ from crossborder.hebei import gov_commodity_hebei_city
 from crossborder.hebei import gov_commodity_hebei_country
 from crossborder.hebei import gov_commodity_hebei_import_export
 from crossborder.utils import base_country_code, base_mysql
-from crossborder.utils.base_country_code import get_last_month
+from crossborder.utils.base_country_code import extract_year_month
 from crossborder.utils.dingtalk import send_dingtalk_message
 from crossborder.utils.log import  get_logger
 
@@ -181,8 +181,7 @@ def crawl_with_selenium(url, mark):
         res = detect_latest_month(driver, url)
         if res is None:
             log.info("河北省海关没有最新数据更新")
-            # sys.exit(0)
-            return
+            return None
         year_month = res
         print(f"检测到最新有效数据:{year_month}")
 
@@ -232,7 +231,7 @@ def crawl_with_selenium(url, mark):
         driver.quit()
         # 等待5s后执行
         time.sleep(5)
-        hierarchical_traversal(download_dir)
+        hierarchical_traversal(download_dir, year_month)
         log.info(f"河北省海关全量数据下载任务完成")
         time.sleep(5)
         base_mysql.update_shandong_yoy('河北省')
@@ -269,7 +268,7 @@ def wait_for_download_complete(timeout=30, existing_files=None):
     raise TimeoutError("文件下载超时")
 
 
-def hierarchical_traversal(root_path):
+def hierarchical_traversal(root_path, year_month):
     """分层遍历:省份->年份->月目录"""
     root = Path(root_path)
     # 获取所有年份目录
@@ -295,10 +294,16 @@ def hierarchical_traversal(root_path):
         if month_dirs:
             for md in sorted(month_dirs, key=lambda x: x["month"], reverse=True):
                 log.info(f"  月份:{md['month']:02d} | 路径:{md['path']}")
-                gov_commodity_hebei_import_export.process_folder(md['path'])
-                gov_commodity_hebei_country.process_folder(md['path'])
-                gov_commodity_hebei_city.process_folder(md['path'])
-
+                path = md['path']
+                if year_month is not None:
+                    year, month = extract_year_month(year_month)
+                    parts = path.parts
+                    if year_dir.name != year or parts[-1] != month:
+                        log.info(f"河北省海关已处理 {year_month} 数据,返回")
+                        return
+                gov_commodity_hebei_import_export.process_folder(path)
+                gov_commodity_hebei_country.process_folder(path)
+                gov_commodity_hebei_city.process_folder(path)
 
 def main():
     try:

+ 14 - 9
crossborder/jiangsu/crawl_gov_jiangsu_full.py

@@ -7,7 +7,6 @@ import time
 import rarfile
 import shutil
 from pathlib import Path
-import sys
 from datetime import datetime, timedelta
 
 from faker import Faker
@@ -23,7 +22,7 @@ from crossborder.jiangsu import gov_commodity_jiangsu_city
 from crossborder.jiangsu import gov_commodity_jiangsu_import_export
 
 from crossborder.utils import base_country_code, base_mysql
-from crossborder.utils.base_country_code import get_last_month
+from crossborder.utils.base_country_code import extract_year_month
 from crossborder.utils.dingtalk import send_dingtalk_message
 from crossborder.utils.log import  get_logger
 
@@ -222,8 +221,7 @@ def crawl_with_selenium(url, mark):
         res = detect_latest_month(driver, url)
         if res is None:
             log.info("江苏省海关没有最新数据更新")
-            # sys.exit(0)
-            return
+            return None
         year_month = res
         print(f"检测到最新有效数据:{year_month}")
 
@@ -276,7 +274,7 @@ def crawl_with_selenium(url, mark):
         # 等待5s后执行
         time.sleep(5)
         all_records = base_mysql.get_hs_all()
-        hierarchical_traversal(download_dir, all_records)
+        hierarchical_traversal(download_dir, all_records, year_month)
         log.info("江苏省海关类章、国家、城市所有文件处理完成!")
         time.sleep(5)
         base_mysql.update_shandong_yoy('江苏省')
@@ -307,7 +305,7 @@ def wait_for_download_complete(timeout=30, existing_files=None):
         time.sleep(2)
     raise TimeoutError("未找到 .rar 文件或超时")
 
-def hierarchical_traversal(root_path, all_records):
+def hierarchical_traversal(root_path, all_records, year_month):
     """分层遍历:省份->年份->月目录"""
     root = Path(root_path)
     # 获取所有年份目录
@@ -333,9 +331,16 @@ def hierarchical_traversal(root_path, all_records):
         if month_dirs:
             for md in sorted(month_dirs, key=lambda x: x["month"], reverse=True):
                 log.info(f"  月份:{md['month']:02d} | 路径:{md['path']}")
-                gov_commodity_jiangsu_import_export.process_folder(md['path'], all_records)
-                gov_commodity_jiangsu_country.process_folder(md['path'])
-                gov_commodity_jiangsu_city.process_folder(md['path'])
+                path = md['path']
+                if year_month is not None:
+                    year, month = extract_year_month(year_month)
+                    parts = path.parts
+                    if year_dir.name != year or parts[-1] != month:
+                        log.info(f"江苏省海关已处理 {year_month} 数据,返回")
+                        return
+                gov_commodity_jiangsu_import_export.process_folder(path, all_records)
+                gov_commodity_jiangsu_country.process_folder(path)
+                gov_commodity_jiangsu_city.process_folder(path)
 
 def main():
     try:

+ 9 - 12
crossborder/utils/base_country_code.py

@@ -330,17 +330,14 @@ def extract_year_month_from_path(path):
     except IndexError:
         raise ValueError("路径结构不符合要求,示例:.../shandong/2025/04")
 
-def get_last_month():
-    # 获取当前时间
-    today = datetime.today()
+def extract_year_month(text):
+    # 使用正则表达式匹配 "YYYY年M月" 或 "YYYY年MM月" 的格式
+    match = re.search(r"(\d{4})年(\d{1,2})月", text)
 
-    # 计算上个月
-    if today.month == 1:
-        # 如果是1月,则上个月是去年12月
-        last_month = today.replace(year=today.year - 1, month=12)
-    else:
-        # 否则,直接减去一个月
-        last_month = today.replace(month=today.month - 1)
+    if match:
+        year = match.group(1)  # 提取年份
+        month = match.group(2).zfill(2)  # 提取月份并补零
 
-    # 返回格式为 yyyy-MM 的字符串
-    return last_month.strftime('%Y-%m')
+        return year, month
+    else:
+        raise ValueError("无法从文本中提取年份和月份")

+ 14 - 8
crossborder/zhejiang/crawl_gov_zhejiang_full.py

@@ -16,7 +16,7 @@ from selenium.webdriver.common.by import By
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.support.ui import WebDriverWait
 
-from crossborder.utils.base_country_code import get_last_month
+from crossborder.utils.base_country_code import extract_year_month
 from crossborder.utils.dingtalk import send_dingtalk_message
 from crossborder.zhejiang import download_dir
 from crossborder.zhejiang import gov_commodity_zhejiang_city
@@ -292,8 +292,7 @@ def crawl_with_selenium(url, mark):
         res = detect_latest_month(driver, url)
         if res is None:
             log.info("浙江省海关没有最新数据更新")
-            # sys.exit(0)
-            return
+            return None
         year_month = res
         print(f"检测到最新有效数据:{year_month}")
 
@@ -318,7 +317,7 @@ def crawl_with_selenium(url, mark):
         log.info(f"浙江省海关全量数据下载任务完成")
         # 等待5s后执行
         time.sleep(5)
-        hierarchical_traversal(download_dir)
+        hierarchical_traversal(download_dir, year_month)
         log.info("浙江省海关类章、国家、城市所有文件处理完成!")
         time.sleep(5)
         base_mysql.update_shandong_yoy('浙江省')
@@ -354,7 +353,7 @@ def wait_for_download_complete(timeout=30, existing_files=None):
         time.sleep(2)
     raise TimeoutError("文件下载超时")
 
-def hierarchical_traversal(root_path):
+def hierarchical_traversal(root_path, year_month):
     """分层遍历:省份->年份->月目录"""
     root = Path(root_path)
     # 获取所有年份目录
@@ -380,9 +379,16 @@ def hierarchical_traversal(root_path):
         if month_dirs:
             for md in sorted(month_dirs, key=lambda x: x["month"], reverse=True):
                 log.info(f"  月份:{md['month']:02d} | 路径:{md['path']}")
-                gov_commodity_zhejiang_import_export.process_folder(md['path'])
-                gov_commodity_zhejiang_country.process_folder(md['path'])
-                gov_commodity_zhejiang_city.process_folder(md['path'])
+                path = md['path']
+                if year_month is not None:
+                    year, month = extract_year_month(year_month)
+                    parts = path.parts
+                    if year_dir.name != year or parts[-1] != month:
+                        log.info(f"浙江省海关已处理 {year_month} 数据,返回")
+                        return
+                gov_commodity_zhejiang_import_export.process_folder(path)
+                gov_commodity_zhejiang_country.process_folder(path)
+                gov_commodity_zhejiang_city.process_folder(path)
 
 def main():
     try: