Преглед изворни кода

项目结构、打包脚本修改

01495251 пре 1 дан
родитељ
комит
2b74a547b9

+ 50 - 0
cli.py

@@ -0,0 +1,50 @@
+import argparse
+from importlib import import_module
+import sys
+
+# Registry of runnable collectors: CLI province name -> dotted path of the
+# module to import. run_province() imports the module and calls its main().
+PROVINCE_MODULES = {
+    "shandong": "shandong.selenium_shandong_download",
+    "guangdong": "guangdong.selenium_guangdong_download",
+    "guangdong-city": "guangdong.selenium_guangdong_city",
+    "henan": "henan.selenium_henan_download",
+    "fujian": "fujian.selenium_fujian_download",
+    "anhui": "anhui.crawl_gov_anhui_full",
+    "jiangsu": "jiangsu.gov_commodity_jiangsu_country",
+    "hebei": "hebei.crawl_gov_hebei_full",
+    "zhejiang": "zhejiang.crawl_gov_zhejiang_full",
+    "quanguo": "quanguo.selenium_download"
+}
+
+def run_province(name, year=None):
+    """Import and run the collector module registered for one province.
+
+    Args:
+        name: Key into ``PROVINCE_MODULES`` selecting the module to import.
+        year: Optional year; when not ``None`` it is forwarded to the module
+            as ``--year <year>`` through a temporary ``sys.argv``.
+
+    Any ``Exception`` raised while importing or running the module is
+    reported on stdout and swallowed, so a caller looping over several
+    provinces keeps going after one failure.
+    """
+    # The province modules are invoked as main() with no arguments;
+    # presumably they parse sys.argv themselves, so the options are staged
+    # there -- TODO confirm against the individual modules.
+    staged_argv = ['script_name']
+    if year is not None:
+        staged_argv.extend(['--year', str(year)])
+
+    # Remember the caller's argv so the override never leaks past this call.
+    saved_argv = sys.argv
+    try:
+        module = import_module(PROVINCE_MODULES[name])
+        print(f"✅ 正在运行 {name} 模块...")
+
+        sys.argv = staged_argv
+        module.main()
+
+    except Exception as e:
+        print(f"❌ {name} 执行失败: {e}")
+    finally:
+        # Restore even on failure; sys.argv is process-wide state.
+        sys.argv = saved_argv
+
+def main():
+    """Parse the command line and run one province or every registered one."""
+    cli = argparse.ArgumentParser(description="跨省数据采集器")
+
+    # Exactly one of --province / --all has to be supplied.
+    target = cli.add_mutually_exclusive_group(required=True)
+    target.add_argument(
+        '--province', '-p',
+        choices=PROVINCE_MODULES.keys(),
+        help='指定省份',
+    )
+    target.add_argument('--all', action='store_true', help='运行所有省份')
+    cli.add_argument('--year', type=int, help='年份(非必填)')
+
+    options = cli.parse_args()
+
+    # Resolve the selection to a list first, then run it in one loop.
+    if options.province:
+        selected = [options.province]
+    elif options.all:
+        selected = list(PROVINCE_MODULES)
+    else:
+        selected = []
+
+    for province in selected:
+        run_province(province, options.year)
+
+if __name__ == "__main__":
+    main()

+ 1 - 1
fujian/fujian_parse_excel.py

@@ -3,7 +3,7 @@ from pathlib import Path
 
 import pandas as pd
 
-from db_helper import DBHelper
+from utils.db_helper import DBHelper
 from utils.constants import DOWNLOAD_DIR
 from utils.parse_utils import convert_wan_to_yuan, extract_year_month_from_path, traverse_and_process
 

+ 1 - 1
guangdong/guangdong_gongbei_parse_excel.py

@@ -3,7 +3,7 @@ from selenium.webdriver.common.by import By
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.support.ui import WebDriverWait
 
-from db_helper import DBHelper
+from utils.db_helper import DBHelper
 from quanguo.detail import parse_value
 from utils.constants import GUANGDONG_CITY
 from utils.log import log

+ 2 - 2
guangdong/guangdong_sub_customs_parse_excel.py

@@ -3,7 +3,7 @@ from pathlib import Path
 
 import pandas as pd
 
-from db_helper import DBHelper
+from utils.db_helper import DBHelper
 from utils.constants import DOWNLOAD_DIR, GUANGDONG_CITY
 from utils.log import log
 from utils.parse_utils import traverse_and_process, extract_year_month_from_path, get_previous_month_dir
@@ -643,7 +643,7 @@ def parse_excel(current_dir):
         # log.debug(f"处理后数据示例:\n{final_df.head()}")
 
         # 这里调用DBHelper入库(实际使用时请取消注释)
-        from db_helper import DBHelper
+        from utils.db_helper import DBHelper
         db = DBHelper()
         db.bulk_insert(
             final_df,

+ 1 - 4
guangdong/selenium_guangdong_city.py

@@ -10,7 +10,7 @@ from selenium.webdriver.common.by import By
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.support.ui import WebDriverWait
 
-from db_helper import DBHelper
+from utils.db_helper import DBHelper
 from guangdong.guangdong_gongbei_parse_excel import parse_region_table, calculate_monthly_data
 from guangdong.guangdong_sub_customs_parse_excel import parse_excel
 from utils.constants import DOWNLOAD_DIR
@@ -202,9 +202,6 @@ def random_sleep(base=2, variance=5):
     time.sleep(sleep_time)
 
 
-from concurrent.futures import ThreadPoolExecutor, as_completed
-
-
 # def process_customs(customs_name, args):
 #     """处理单个海关的数据抓取任务"""
 #     options = configure_stealth_options(download_dir)

+ 1 - 2
guangdong/selenium_guangdong_download.py

@@ -1,5 +1,4 @@
 import argparse
-import argparse
 import random
 import re
 import time
@@ -12,7 +11,7 @@ from selenium.webdriver.common.by import By
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.support.ui import WebDriverWait
 
-from db_helper import DBHelper
+from utils.db_helper import DBHelper
 from utils.constants import DOWNLOAD_DIR, COUNTRY_CODE_MAPPING
 from utils.download_utils import configure_stealth_options, generate_month_sequence
 from utils.log import log

+ 1 - 1
henan/henan_parse_excel.py

@@ -3,7 +3,7 @@ from pathlib import Path
 
 import pandas as pd
 
-from db_helper import DBHelper
+from utils.db_helper import DBHelper
 from utils.constants import COUNTRY_CODE_MAPPING, EXCLUDE_REGIONS, DOWNLOAD_DIR
 from utils.parse_utils import clean_county_name, clean_commodity_name, convert_wan_to_yuan, find_unmatched_countries, \
     extract_year_month_from_path, traverse_and_process

+ 0 - 0
selenium_download.py → quanguo/selenium_download.py


+ 12 - 0
setup.py

@@ -34,6 +34,18 @@ setup(
     entry_points={
         'console_scripts': [
             'run-shandong=shandong.selenium_shandong_download:main',
+            'run-guangdong=guangdong.selenium_guangdong_download:main',
+            'run-guangdong-city=guangdong.selenium_guangdong_city:main',
+            'run-henan=henan.selenium_henan_download:main',
+            'run-fujian=fujian.selenium_fujian_download:main',
+            'run-anhui=anhui.crawl_gov_anhui_full:main',
+            'run-jiangsu=jiangsu.gov_commodity_jiangsu_country:main',
+            'run-hebei=hebei.crawl_gov_hebei_full:main',
+            'run-zhejiang=zhejiang.crawl_gov_zhejiang_full:main',
+            'run-quanguo=quanguo.selenium_download:main',
+
+            # 统一入口命令
+            'run-crossborder=crossborder.cli:main',
         ],
     },
 )

+ 1 - 1
shandong/selenium_shandong_download.py

@@ -9,7 +9,7 @@ from selenium.webdriver.common.by import By
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.support.ui import WebDriverWait
 
-from db_helper import DBHelper
+from utils.db_helper import DBHelper
 from shandong.shandong_parse_excel import parse_excel
 from utils.constants import DOWNLOAD_DIR
 from utils.download_utils import configure_stealth_options, generate_month_sequence, download_excel

+ 1 - 1
shandong/shandong_parse_excel.py

@@ -4,7 +4,7 @@ from pathlib import Path
 import numpy as np
 import pandas as pd
 
-from db_helper import DBHelper
+from utils.db_helper import DBHelper
 from quanguo.CountryTrade import COUNTRY_CODE_MAPPING
 from utils.constants import DOWNLOAD_DIR
 from utils.log import log

+ 0 - 0
db_helper.py → utils/db_helper.py