|
@@ -24,7 +24,7 @@ log = get_logger(__name__)
|
|
|
base_url = "http://www.customs.gov.cn/customs/302249/zfxxgk/2799825/302274/302277/6348926/index.html"
|
|
|
download_dir = DOWNLOAD_DIR / "total"
|
|
|
downloaded_tables = set() # 已下载的表格名集合
|
|
|
-
|
|
|
+data_collected = False # 是否有数据被采集
|
|
|
|
|
|
|
|
|
def generate_table_title(year):
|
|
@@ -118,7 +118,8 @@ def go_to_year_page(driver, year):
|
|
|
return False
|
|
|
|
|
|
|
|
|
-def crawl_with_selenium(driver, year, latest_only=False,data_collected=False):
|
|
|
+def crawl_with_selenium(driver, year, latest_only=False):
|
|
|
+ global data_collected
|
|
|
"""主抓取函数"""
|
|
|
if year < datetime.now().year:
|
|
|
if not go_to_year_page(driver, year):
|
|
@@ -195,7 +196,7 @@ def handle_month_data(driver, table_name, month_links, year):
|
|
|
# 下载成功后将表格名加入集合
|
|
|
downloaded_tables.add(table_name)
|
|
|
except Exception as e:
|
|
|
- log.info(f"【异常】下载失败: {str(e)}")
|
|
|
+ log.error(f"【异常】{year}-{month_num:02d} {table_name}: {str(e)}")
|
|
|
time.sleep(random.uniform(0.5, 1.5)) # 下载间隔
|
|
|
def main():
|
|
|
|
|
@@ -206,7 +207,6 @@ def main():
|
|
|
|
|
|
start_time = time.time()
|
|
|
years_processed = [] # 记录成功处理的年份
|
|
|
- data_collected = False # 是否有数据被采集
|
|
|
driver = None
|
|
|
|
|
|
try:
|
|
@@ -236,12 +236,12 @@ def main():
|
|
|
# 4. 年份遍历采集
|
|
|
for year in years_to_crawl:
|
|
|
try:
|
|
|
- log.info(f"\n【{year}年】开始处理".center(66, "-"))
|
|
|
+ log.info(f"【{year}年】开始处理".center(66, "-"))
|
|
|
|
|
|
is_latest_only = (not args.year) and (year == current_year)
|
|
|
|
|
|
# 执行年份采集
|
|
|
- crawl_with_selenium(driver, year=year, latest_only=is_latest_only,data_collected = data_collected)
|
|
|
+ crawl_with_selenium(driver, year=year, latest_only=is_latest_only)
|
|
|
years_processed.append(year)
|
|
|
|
|
|
log.info(f"【{year}年】处理完成".center(66, "-"))
|
|
@@ -263,11 +263,11 @@ def main():
|
|
|
|
|
|
# 6. 数据清洗入库(仅当有数据被采集时)
|
|
|
if data_collected:
|
|
|
- log.info("\n【海关总署】数据清洗入库开始".center(66, "*"))
|
|
|
+ log.info("【海关总署】数据清洗入库开始".center(66, "*"))
|
|
|
|
|
|
try:
|
|
|
log.info("数据清洗入库中...")
|
|
|
- perform_data_cleanup_and_import(years_processed)
|
|
|
+ perform_data_cleanup_and_import(current_year)
|
|
|
log.info("数据清洗入库完毕")
|
|
|
|
|
|
except Exception as e:
|