|
@@ -18,7 +18,7 @@ from crossborder.utils.log import log
|
|
|
|
|
|
base_url = "http://www.customs.gov.cn/customs/302249/zfxxgk/2799825/302274/302277/6348926/index.html"
|
|
base_url = "http://www.customs.gov.cn/customs/302249/zfxxgk/2799825/302274/302277/6348926/index.html"
|
|
download_dir = DOWNLOAD_DIR / "total"
|
|
download_dir = DOWNLOAD_DIR / "total"
|
|
-
|
|
|
|
|
|
+downloaded_tables = set() # 已下载的表格名集合
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@@ -113,14 +113,13 @@ def go_to_year_page(driver, year):
|
|
return False
|
|
return False
|
|
|
|
|
|
|
|
|
|
-def crawl_with_selenium(driver, base_url, year, latest_only=False):
|
|
|
|
|
|
+def crawl_with_selenium(driver, year, latest_only=False):
|
|
"""主抓取函数"""
|
|
"""主抓取函数"""
|
|
- driver.get(base_url)
|
|
|
|
-
|
|
|
|
- if not go_to_year_page(driver, year):
|
|
|
|
- log.warning(f"{year} 页面不可用,跳过")
|
|
|
|
- return
|
|
|
|
- log.info(f"开始抓取 {year} 年数据:{driver.current_url}")
|
|
|
|
|
|
+ if year < datetime.now().year:
|
|
|
|
+ if not go_to_year_page(driver, year):
|
|
|
|
+ log.warning(f"{year} 页面不可用,跳过")
|
|
|
|
+ return
|
|
|
|
+ log.info(f"开始抓取 {year} 年数据,当前标题: {driver.title}")
|
|
try:
|
|
try:
|
|
while True:
|
|
while True:
|
|
table = WebDriverWait(driver, 20).until(
|
|
table = WebDriverWait(driver, 20).until(
|
|
@@ -146,7 +145,7 @@ def crawl_with_selenium(driver, base_url, year, latest_only=False):
|
|
time.sleep(random.uniform(1, 3))
|
|
time.sleep(random.uniform(1, 3))
|
|
|
|
|
|
except StaleElementReferenceException:
|
|
except StaleElementReferenceException:
|
|
- log.info("检测到元素失效,自动刷新表格")
|
|
|
|
|
|
+ log.error("检测到元素失效,自动刷新表格")
|
|
driver.refresh()
|
|
driver.refresh()
|
|
WebDriverWait(driver, 30).until(
|
|
WebDriverWait(driver, 30).until(
|
|
EC.presence_of_element_located((By.CSS_SELECTOR, f"#yb{year}RMB"))
|
|
EC.presence_of_element_located((By.CSS_SELECTOR, f"#yb{year}RMB"))
|
|
@@ -158,9 +157,14 @@ def sanitize_filename(filename):
|
|
|
|
|
|
|
|
|
|
def handle_month_data(driver, table_name, month_links, year, latest_only):
|
|
def handle_month_data(driver, table_name, month_links, year, latest_only):
|
|
|
|
+ global downloaded_tables
|
|
main_window = driver.current_window_handle
|
|
main_window = driver.current_window_handle
|
|
for idx, month_data in enumerate(month_links):
|
|
for idx, month_data in enumerate(month_links):
|
|
if 1 <= month_data[0] <= 12:
|
|
if 1 <= month_data[0] <= 12:
|
|
|
|
+ # 年度表月度表只下载一次(最新月份数据)
|
|
|
|
+ if "进出口商品总值表" in table_name and table_name in downloaded_tables:
|
|
|
|
+ log.info(f"【{table_name}】已下载过,跳过")
|
|
|
|
+ continue
|
|
# 新标签页策略(防止主页面DOM变更)
|
|
# 新标签页策略(防止主页面DOM变更)
|
|
driver.switch_to.window(main_window)
|
|
driver.switch_to.window(main_window)
|
|
driver.execute_script(f"window.open('{month_data[1]}', '_blank_{idx}')")
|
|
driver.execute_script(f"window.open('{month_data[1]}', '_blank_{idx}')")
|
|
@@ -169,6 +173,8 @@ def handle_month_data(driver, table_name, month_links, year, latest_only):
|
|
month_num, link = month_data
|
|
month_num, link = month_data
|
|
try:
|
|
try:
|
|
download_excel(driver, link, year, month_num, table_name, download_dir)
|
|
download_excel(driver, link, year, month_num, table_name, download_dir)
|
|
|
|
+ # 下载成功后将表格名加入集合
|
|
|
|
+ downloaded_tables.add(table_name)
|
|
except Exception as e:
|
|
except Exception as e:
|
|
log.info(f"【异常】下载失败: {str(e)}")
|
|
log.info(f"【异常】下载失败: {str(e)}")
|
|
time.sleep(random.uniform(0.5, 1.5)) # 下载间隔
|
|
time.sleep(random.uniform(0.5, 1.5)) # 下载间隔
|
|
@@ -190,11 +196,12 @@ if __name__ == "__main__":
|
|
options = configure_stealth_options(download_dir)
|
|
options = configure_stealth_options(download_dir)
|
|
driver = webdriver.Firefox(options=options)
|
|
driver = webdriver.Firefox(options=options)
|
|
|
|
|
|
|
|
+ base_url = "http://www.customs.gov.cn/customs/302249/zfxxgk/2799825/302274/302277/6348926/index.html"
|
|
|
|
+ driver.get(base_url)
|
|
try:
|
|
try:
|
|
for year in years_to_crawl:
|
|
for year in years_to_crawl:
|
|
- base_url = "http://www.customs.gov.cn/customs/302249/zfxxgk/2799825/302274/302277/6348926/index.html"
|
|
|
|
log.info(f"\n【{year}年】开始抓取...".center(66, "-"))
|
|
log.info(f"\n【{year}年】开始抓取...".center(66, "-"))
|
|
- crawl_with_selenium(driver, base_url, year=year, latest_only=args.year is None)
|
|
|
|
|
|
+ crawl_with_selenium(driver, year=year, latest_only=args.year is None)
|
|
finally:
|
|
finally:
|
|
driver.quit()
|
|
driver.quit()
|
|
log.info("【海关总署】全年数据抓取结束".center(66, "*"))
|
|
log.info("【海关总署】全年数据抓取结束".center(66, "*"))
|