# download_utils.py (11 KB)
  1. import re
  2. import threading
  3. import time
  4. import traceback
  5. from pathlib import Path
  6. from faker import Faker
  7. from selenium.webdriver import FirefoxOptions, ActionChains
  8. from selenium.webdriver.common.by import By
  9. from selenium.webdriver.support import expected_conditions as EC
  10. from selenium.webdriver.support.ui import WebDriverWait
  11. from utils.log import log
# Upper bound, in seconds, on how long wait_for_download() keeps polling
# before it raises TimeoutError.
DOWNLOAD_TIMEOUT = 60
# Held by wait_for_download() while it scans the download directory.
download_lock = threading.Lock()
  14. def configure_stealth_options(download_dir):
  15. """反检测浏览器配置"""
  16. opts = FirefoxOptions()
  17. opts.set_preference("dom.webdriver.enabled", False)
  18. opts.add_argument("--disable-blink-features=AutomationControlled")
  19. opts.add_argument("--headless")
  20. opts.add_argument("--window-size=1366,900") # 固定视口
  21. opts.set_preference("general.useragent.override", Faker().user_agent())
  22. opts.set_preference("browser.download.dir", str(download_dir))
  23. opts.set_preference("browser.download.folderList", 2)
  24. opts.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/vnd.ms-excel")
  25. opts.set_preference("pdfjs.disabled", True)
  26. opts.set_preference("download.prompt_for_download", False)
  27. opts.set_preference("download.directory_upgrade", True)
  28. return opts
  29. def generate_month_sequence(start_year, start_month, end_year=None, skip_january=False):
  30. """
  31. 动态生成倒序月份序列
  32. Args:
  33. start_year (int): 检测到的最新数据年份
  34. start_month (int): 检测到的最新数据月份
  35. end_year (int, optional): 终止年份(默认 None)
  36. skip_january (bool, optional): 是否跳过所有1月数据(默认 False)
  37. Returns:
  38. List[Tuple[int, int]]: 月份序列列表,格式为 [(year, month), ...]
  39. """
  40. sequence = []
  41. current_year = start_year
  42. current_month = start_month
  43. # 当指定终止年份时
  44. if end_year:
  45. while not (current_year == end_year and current_month < 1):
  46. # 跳过1月判断
  47. if not (skip_january and current_month == 1):
  48. sequence.append((current_year, current_month))
  49. # 跨年处理
  50. if current_month == 1:
  51. current_year -= 1
  52. current_month = 12
  53. else:
  54. current_month -= 1
  55. # 终止条件:到达目标年份的1月
  56. if current_year < end_year:
  57. break
  58. else:
  59. # 未指定年份时取最近两个月
  60. if not (skip_january and current_month == 1):
  61. sequence.append((current_year, current_month))
  62. prev_year, prev_month = get_previous_month(current_year, current_month)
  63. if not (skip_january and prev_month == 1):
  64. sequence.append((prev_year, prev_month))
  65. return sequence
  66. def get_previous_month(year, month):
  67. """跨年月份计算"""
  68. if month == 1:
  69. return year - 1, 12
  70. return year, month - 1
  71. def download_excel(driver, url, year, month, title, download_dir):
  72. """文件下载模块"""
  73. download_dir.mkdir(parents=True, exist_ok=True)
  74. driver.execute_script(f"window.open('{url}')")
  75. driver.switch_to.window(driver.window_handles[-1])
  76. try:
  77. download_btn = WebDriverWait(driver, 20).until(
  78. EC.element_to_be_clickable(
  79. (By.XPATH,
  80. '//a[substring(@href, string-length(@href)-3) = ".xls" or substring(@href, string-length(@href)-4) = ".xlsx"]')
  81. )
  82. )
  83. # download_btn.click()
  84. driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", download_btn)
  85. # 使用 ActionChains 安全点击
  86. ActionChains(driver).move_to_element(download_btn).pause(0.3).click().perform()
  87. # log.info(f"√ 已点击下载按钮:{download_btn.get_attribute("href")}")
  88. downloaded_file = wait_for_download(download_dir)
  89. final_path = Path(f'{download_dir}/{year}/{month:02d}/{title}{downloaded_file.suffix}')
  90. if final_path.exists():
  91. final_path.unlink()
  92. download_rel_dir = Path(f'{download_dir}/{year}/{month:02d}')
  93. download_rel_dir.mkdir(parents=True, exist_ok=True)
  94. downloaded_file.rename(final_path)
  95. log.info(f"√ 文件已保存至:{final_path}")
  96. finally:
  97. driver.close()
  98. driver.switch_to.window(driver.window_handles[0])
  99. # def batch_download_excel(driver, url, year, month, base_title, download_dir):
  100. # """批量下载Excel文件模块"""
  101. # download_dir.mkdir(parents=True, exist_ok=True)
  102. #
  103. # driver.execute_script(f"window.open('{url}')")
  104. # driver.switch_to.window(driver.window_handles[-1])
  105. #
  106. # try:
  107. # # 获取所有Excel下载按钮
  108. # download_btns = WebDriverWait(driver, 20).until(
  109. # EC.presence_of_all_elements_located(
  110. # (By.XPATH,
  111. # '//a[substring(@href, string-length(@href)-3) = ".xls" or substring(@href, string-length(@href)-4) = ".xlsx"]')
  112. # )
  113. # )
  114. #
  115. # for index, btn in enumerate(download_btns):
  116. # # 生成唯一标题(可自定义规则)
  117. # title = f"{base_title}_{index + 1}"
  118. #
  119. # # 点击下载按钮
  120. # btn.click()
  121. #
  122. # # 等待下载完成
  123. # downloaded_file = wait_for_download(download_dir)
  124. #
  125. # # 处理文件路径
  126. # final_dir = download_dir / f'{year}' / f'{month:02d}'
  127. # final_dir.mkdir(parents=True, exist_ok=True)
  128. # final_path = final_dir / f'{title}{downloaded_file.suffix}'
  129. #
  130. # # 重命名文件
  131. # if final_path.exists():
  132. # final_path.unlink()
  133. # downloaded_file.rename(final_path)
  134. # log.info(f"√ 文件 {title} 已保存至:{final_path}")
  135. #
  136. # finally:
  137. # driver.close()
  138. # driver.switch_to.window(driver.window_handles[0])
  139. def batch_download_excel(driver, url, base_year, base_month, base_title, download_dir):
  140. """批量下载Excel文件模块 - 支持根据文件名自动识别并分类到对应月份"""
  141. download_dir.mkdir(parents=True, exist_ok=True)
  142. driver.execute_script(f"window.open('{url}')")
  143. driver.switch_to.window(driver.window_handles[-1])
  144. try:
  145. # 获取所有Excel下载按钮
  146. download_btns = WebDriverWait(driver, 20).until(
  147. EC.presence_of_all_elements_located(
  148. (By.XPATH,
  149. '//a[substring(@href, string-length(@href)-3) = ".xls" or substring(@href, string-length(@href)-4) = ".xlsx"]')
  150. )
  151. )
  152. for index, btn in enumerate(download_btns):
  153. raw_title = btn.text.strip() or btn.get_attribute("textContent").strip()
  154. if not raw_title:
  155. log.warning("⚠️ 标题为空,跳过处理")
  156. continue
  157. # 提取年份
  158. year_match = re.search(r'(\d{4})\s*年', raw_title)
  159. file_year = int(year_match.group(1)) if year_match else base_year
  160. # 提取月份(支持多种形式)
  161. month_match = re.search(
  162. r'(?:1[--]\s*)?(\d{1,2})月|前?(\d{1,2})个?月|第([一二三四])季度',
  163. raw_title
  164. )
  165. file_month = base_month
  166. if month_match:
  167. digit_groups = month_match.groups()
  168. if digit_groups[0]:
  169. file_month = int(digit_groups[0])
  170. elif digit_groups[1]:
  171. file_month = int(digit_groups[1])
  172. elif digit_groups[2]:
  173. quarter_map = {"一": 3, "二": 6, "三": 9, "四": 12}
  174. file_month = quarter_map.get(digit_groups[2], base_month)
  175. # 确保月份合法
  176. if not (1 <= file_month <= 12):
  177. file_month = base_month
  178. # 构建目录
  179. final_dir = download_dir / f'{file_year}' / f'{file_month:02d}'
  180. final_dir.mkdir(parents=True, exist_ok=True)
  181. # 下载文件
  182. btn.click()
  183. downloaded_file = wait_for_download(download_dir)
  184. # 构建最终路径(去除已有后缀,防止重复)
  185. clean_title = Path(raw_title).stem
  186. final_path = final_dir / f'{clean_title}{downloaded_file.suffix}'
  187. # 删除已存在文件并重命名
  188. if final_path.exists():
  189. final_path.unlink()
  190. downloaded_file.rename(final_path)
  191. log.info(f"√ 文件 {clean_title} 已保存至:{final_path}")
  192. finally:
  193. driver.close()
  194. driver.switch_to.window(driver.window_handles[0])
  195. def download_excel2(driver, link, year, month, title, download_dir):
  196. download_dir = Path(download_dir)
  197. download_dir.mkdir(parents=True, exist_ok=True)
  198. try:
  199. log.info(f"正在点击链接:{title}")
  200. link.click()
  201. log.info("等待文件下载完成...")
  202. downloaded_file = wait_for_download(download_dir)
  203. if not downloaded_file.suffix:
  204. downloaded_file = downloaded_file.with_suffix('.xlsx')
  205. final_dir = download_dir / f'{year}' / f'{month:02d}'
  206. final_dir.mkdir(parents=True, exist_ok=True)
  207. final_path = final_dir / f'{title}{downloaded_file.suffix}'
  208. if final_path.exists():
  209. final_path.unlink()
  210. downloaded_file.rename(final_path)
  211. log.info(f"√ 文件已保存至:{final_path}")
  212. except TimeoutError as te:
  213. log.info(f"[错误] 文件下载超时:{te}")
  214. raise
  215. except Exception as e:
  216. log.info(f"[错误] 发生异常:{e}")
  217. log.info(traceback.format_exc())
  218. raise
  219. # def wait_for_download(directory):
  220. # """文件下载监控(只读取文件,忽略文件夹)"""
  221. # start_time = time.time()
  222. # while (time.time() - start_time) < DOWNLOAD_TIMEOUT:
  223. # files = [
  224. # f for f in directory.glob('*')
  225. # if f.is_file() and not f.name.endswith(('.part', '.crdownload'))
  226. # ]
  227. # if files:
  228. # # 按照创建时间排序并返回最新文件
  229. # return max(files, key=lambda x: x.stat().st_ctime)
  230. # time.sleep(1)
  231. # raise TimeoutError("文件下载超时")
  232. def wait_for_download(directory):
  233. """文件下载监控(只读取文件,忽略文件夹)"""
  234. start_time = time.time()
  235. while (time.time() - start_time) < DOWNLOAD_TIMEOUT:
  236. with download_lock:
  237. files = [
  238. f for f in directory.glob('*')
  239. if f.is_file() and not f.name.endswith(('.part', '.crdownload'))
  240. ]
  241. if files:
  242. # 按照创建时间排序并返回最新文件
  243. return max(files, key=lambda x: x.stat().st_ctime)
  244. time.sleep(1)
  245. raise TimeoutError("文件下载超时")