|
@@ -1,3 +1,4 @@
|
|
|
+import argparse
|
|
|
import os
|
|
|
import random
|
|
|
import re
|
|
@@ -15,12 +16,12 @@ from selenium.webdriver.common.by import By
|
|
|
from selenium.webdriver.support import expected_conditions as EC
|
|
|
from selenium.webdriver.support.ui import WebDriverWait
|
|
|
|
|
|
-from zhejiang import download_dir
|
|
|
-from zhejiang import gov_commodity_zhejiang_city
|
|
|
-from zhejiang import gov_commodity_zhejiang_country
|
|
|
-from zhejiang import gov_commodity_zhejiang_import_export
|
|
|
-from utils import base_country_code, base_mysql
|
|
|
-from utils.log import log
|
|
|
+from crossborder.zhejiang import download_dir
|
|
|
+from crossborder.zhejiang import gov_commodity_zhejiang_city
|
|
|
+from crossborder.zhejiang import gov_commodity_zhejiang_country
|
|
|
+from crossborder.zhejiang import gov_commodity_zhejiang_import_export
|
|
|
+from crossborder.utils import base_country_code, base_mysql
|
|
|
+from crossborder.utils.log import log
|
|
|
|
|
|
def configure_stealth_options():
|
|
|
"""增强型反检测配置[1,4](@ref)"""
|
|
@@ -120,7 +121,7 @@ def process_month_tabs(driver, year, base_url, year_month):
|
|
|
retry_count += 1
|
|
|
break
|
|
|
if tar_month != month_text:
|
|
|
- log.info(f"{year}年 {month_text} 月份跳过, increment tar: {year_month}")
|
|
|
+ log.info(f"{year}年 {month_text} 月份跳过, auto tar: {year_month}")
|
|
|
continue
|
|
|
a_tag.click()
|
|
|
|
|
@@ -283,7 +284,7 @@ def crawl_with_selenium(url, mark):
|
|
|
driver = webdriver.Firefox(options=configure_stealth_options())
|
|
|
|
|
|
year_month = None
|
|
|
- if 'increment' == mark:
|
|
|
+ if 'auto' == mark:
|
|
|
res = detect_latest_month(driver, url)
|
|
|
if res is None:
|
|
|
log.info("浙江省海关没有最新数据更新")
|
|
@@ -380,17 +381,14 @@ def hierarchical_traversal(root_path):
|
|
|
gov_commodity_zhejiang_city.process_folder(md['path'])
|
|
|
|
|
|
def main():
|
|
|
- # crawl_with_selenium('http://hangzhou.customs.gov.cn/hangzhou_customs/575609/zlbd/575612/575612/6430241/6430315/index.html', 'all')
|
|
|
- crawl_with_selenium('http://hangzhou.customs.gov.cn/hangzhou_customs/575609/zlbd/575612/575612/6430241/6430315/index.html', 'increment')
|
|
|
- # log.info(f"浙江杭州海关全量数据下载任务完成")
|
|
|
- # # 等待5s后执行
|
|
|
- # time.sleep(5)
|
|
|
- # hierarchical_traversal(download_dir)
|
|
|
- # log.info("浙江杭州海关类章、国家、城市所有文件处理完成!")
|
|
|
- # time.sleep(5)
|
|
|
- # base_mysql.update_january_yoy('浙江省')
|
|
|
- # base_mysql.update_shandong_yoy('浙江省')
|
|
|
- # log.info("浙江杭州海关城市同比sql处理完成")
|
|
|
+ parser = argparse.ArgumentParser(description="爬取模式: 全量(all) 或 增量(auto)")
|
|
|
+ parser.add_argument("mode", choices=["all", "auto"], help="运行模式")
|
|
|
+ args = parser.parse_args()
|
|
|
+
|
|
|
+ if args.mode == "all":
|
|
|
+ crawl_with_selenium('http://hangzhou.customs.gov.cn/hangzhou_customs/575609/zlbd/575612/575612/6430241/6430315/index.html', 'all')
|
|
|
+ else:
|
|
|
+ crawl_with_selenium('http://hangzhou.customs.gov.cn/hangzhou_customs/575609/zlbd/575612/575612/6430241/6430315/index.html','auto')
|
|
|
|
|
|
if __name__ == '__main__':
|
|
|
main()
|