|
@@ -1,26 +1,19 @@
|
|
|
import argparse
|
|
|
-import os
|
|
|
import random
|
|
|
import re
|
|
|
import time
|
|
|
from datetime import datetime
|
|
|
-from pathlib import Path
|
|
|
|
|
|
-from faker import Faker
|
|
|
from selenium import webdriver
|
|
|
from selenium.common import StaleElementReferenceException
|
|
|
-from selenium.webdriver import FirefoxOptions, ActionChains
|
|
|
+from selenium.webdriver import ActionChains
|
|
|
+from selenium.webdriver.common.by import By
|
|
|
from selenium.webdriver.support import expected_conditions as EC
|
|
|
from selenium.webdriver.support.ui import WebDriverWait
|
|
|
|
|
|
from crossborder.quanguo.data_cleaning_to_db import perform_data_cleanup_and_import
|
|
|
-from crossborder.quanguo.parse_country_table_excel import parse_country_table_excel
|
|
|
-from crossborder.quanguo.parse_month_excel import parse_month_table_excel
|
|
|
-from crossborder.quanguo.parse_year_excel import parse_year_table_excel
|
|
|
from crossborder.utils.constants import DOWNLOAD_DIR
|
|
|
-from crossborder.utils.download_utils import configure_stealth_options, wait_for_download, download_excel
|
|
|
-from selenium.webdriver.common.by import By
|
|
|
-
|
|
|
+from crossborder.utils.download_utils import configure_stealth_options, download_excel
|
|
|
from crossborder.utils.log import log
|
|
|
|
|
|
base_url = "http://www.customs.gov.cn/customs/302249/zfxxgk/2799825/302274/302277/6348926/index.html"
|
|
@@ -40,36 +33,62 @@ def generate_table_title(year):
|
|
|
f"(16){year}年自部分国家(地区)进口商品类章金额表"
|
|
|
]
|
|
|
|
|
|
+
|
|
|
def process_table_row(row):
|
|
|
- """动态处理表格行数据(Selenium语法)"""
|
|
|
+ """更健壮的表格行处理函数"""
|
|
|
try:
|
|
|
- # 获取所有表格单元格(td)元素
|
|
|
- cells = row.find_elements(By.TAG_NAME, 'td')
|
|
|
+ # 使用相对定位获取单元格
|
|
|
+ cells = WebDriverWait(row, 15).until(
|
|
|
+ EC.presence_of_all_elements_located((By.XPATH, "./td"))
|
|
|
+ )
|
|
|
+
|
|
|
if len(cells) < 2:
|
|
|
return None
|
|
|
|
|
|
- # 获取表格名
|
|
|
- table_name = cells[0].text.strip()
|
|
|
+ # 使用文本内容稳定性检查
|
|
|
+ table_name = None
|
|
|
+ for attempt in range(3):
|
|
|
+ try:
|
|
|
+ table_name = cells[0].text.strip()
|
|
|
+ if table_name: # 确认文本稳定获取
|
|
|
+ break
|
|
|
+ except StaleElementReferenceException:
|
|
|
+ # 重新获取单元格
|
|
|
+ cells = row.find_elements(By.XPATH, "./td")
|
|
|
+ if len(cells) < 2:
|
|
|
+ return None
|
|
|
+ time.sleep(0.5)
|
|
|
+
|
|
|
+ if not table_name:
|
|
|
+ return None
|
|
|
|
|
|
- # 获取第二列中的所有链接,提取月份和href
|
|
|
+ # 月份链接处理(使用更稳定的XPATH)
|
|
|
month_links = []
|
|
|
- links = cells[1].find_elements(By.TAG_NAME, 'a')
|
|
|
- for a in links:
|
|
|
- # 获取文本并去掉‘月’
|
|
|
- month_text = a.text
|
|
|
- if '月' in month_text:
|
|
|
- month = int(month_text.replace('月', '').strip())
|
|
|
- href = a.get_attribute('href')
|
|
|
- if href:
|
|
|
- month_links.append((month, href))
|
|
|
-
|
|
|
- # 按月份倒序排列(1-12月)
|
|
|
- month_links.sort(key=lambda x: x[0], reverse=True)
|
|
|
+ link_elements = cells[1].find_elements(By.XPATH, ".//a")
|
|
|
+
|
|
|
+ for a in link_elements:
|
|
|
+ try:
|
|
|
+ # 添加临时等待避免元素状态变化
|
|
|
+ time.sleep(0.2)
|
|
|
+ month_text = a.text.strip()
|
|
|
+ if '月' in month_text:
|
|
|
+ month = int(month_text.replace('月', '').strip())
|
|
|
+ href = a.get_attribute('href')
|
|
|
+ if href:
|
|
|
+ month_links.append((month, href))
|
|
|
+ except StaleElementReferenceException:
|
|
|
+ continue # 跳过已无效的链接
|
|
|
+ except Exception as e:
|
|
|
+ log.debug(f"处理链接异常: {str(e)}")
|
|
|
+
|
|
|
+ # 如果获取到链接再排序
|
|
|
+ if month_links:
|
|
|
+ month_links.sort(key=lambda x: x[0], reverse=True)
|
|
|
|
|
|
return (table_name, month_links)
|
|
|
|
|
|
except Exception as e:
|
|
|
- log.info(f"表格行处理异常: {str(e)}")
|
|
|
+ log.info(f"表格行处理异常: {str(e)}", exc_info=True)
|
|
|
return None
|
|
|
|
|
|
|
|
@@ -181,3 +200,4 @@ if __name__ == "__main__":
|
|
|
log.info("【海关总署】全年数据抓取结束".center(66, "*"))
|
|
|
log.info("\n数据清洗入库中...")
|
|
|
perform_data_cleanup_and_import(current_year)
|
|
|
+ log.info("\n数据清洗入库完毕...")
|