Python 自动化之批量下载与自动压缩——省时省力
# Work is full of repetitive download chores: bulk-downloading courseware
# attachments, grabbing every image on a web page, fetching the files named in
# a URL list. Combined with automatic archiving, one command replaces doing
# them by hand one at a time.
#
# Part 1 - Batch downloading
# 1. Download from a list of URLs
import os
import posixpath
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import unquote, urlsplit


class BatchDownloader:
    """Download many URLs into one directory using a thread pool."""

    def __init__(self, save_dir="downloads", max_workers=5):
        """Remember the settings and create ``save_dir`` if it is missing.

        Args:
            save_dir: Directory that receives every downloaded file.
            max_workers: Size of the thread pool used by ``download_many``.
        """
        self.save_dir = save_dir
        self.max_workers = max_workers
        os.makedirs(save_dir, exist_ok=True)

    @staticmethod
    def _filename_from_url(url):
        """Derive a local file name from the path component of ``url``.

        The query string and fragment are ignored, percent-escapes are
        decoded, and only the last path segment is kept. ``index.html`` is
        used when that segment is empty or is a ``.``/``..`` reference, so the
        result can never point outside the save directory.
        """
        name = posixpath.basename(unquote(urlsplit(url).path))
        if name in ("", ".", ".."):
            return "index.html"
        return name

    def download_one(self, url, filename=None):
        """Download a single URL, resuming a partial file when possible.

        Args:
            url: Address to fetch.
            filename: Name to store the file under inside ``save_dir``;
                derived from the URL when omitted.

        Returns:
            The path of the saved file, or ``None`` if anything failed (the
            error is printed, never raised, so one bad URL cannot abort a
            whole batch).
        """
        try:
            # Third-party dependency, imported where it is used so the class
            # can be imported (and its helpers used) without it installed.
            import requests

            if not filename:
                filename = self._filename_from_url(url)
            filepath = os.path.join(self.save_dir, filename)

            # Resume support: ask only for the bytes we do not have yet.
            existing = os.path.getsize(filepath) if os.path.exists(filepath) else 0
            headers = {}
            if existing:
                headers["Range"] = f"bytes={existing}-"

            with requests.get(url, headers=headers, timeout=30, stream=True) as resp:
                if existing and resp.status_code == 416:
                    # The requested range starts at or past the end of the
                    # remote file. NOTE(review): treated as "already
                    # complete"; a larger, unrelated local file would look
                    # the same - confirm this is acceptable.
                    print(f"✅ {filename} ({existing / 1024:.1f}KB)")
                    return filepath
                resp.raise_for_status()
                # Append only when the server really honoured the Range
                # header (206). A plain 200 carries the whole file, so
                # appending it would corrupt the partial download.
                mode = "ab" if resp.status_code == 206 else "wb"
                with open(filepath, mode) as f:
                    for chunk in resp.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)

            size = os.path.getsize(filepath)
            print(f"✅ {filename} ({size / 1024:.1f}KB)")
            return filepath
        except Exception as e:  # deliberate best-effort boundary per file
            print(f"❌ {filename or url}: {e}")
            return None

    def download_many(self, urls, filenames=None):
        """Download several URLs concurrently.

        Args:
            urls: Iterable of addresses to fetch.
            filenames: Optional iterable of target names paired with ``urls``
                by position; names are derived from the URLs when omitted.

        Returns:
            A list with one entry per URL, in input order: the saved path on
            success or ``None`` on failure.
        """
        urls = list(urls)
        filenames = [None] * len(urls) if filenames is None else list(filenames)

        start = time.time()
        results = [None] * len(urls)
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            # Map each future back to its position so results keep input order
            # even though downloads finish in arbitrary order.
            futures = {
                executor.submit(self.download_one, url, name): index
                for index, (url, name) in enumerate(zip(urls, filenames))
            }
            for future in as_completed(futures):
                results[futures[future]] = future.result()

        success = sum(1 for path in results if path)
        elapsed = time.time() - start
        print(f"\n下载完成: {success}/{len(urls)} 成功, 耗时 {elapsed:.1f}s")
        return results


# Usage
if __name__ == "__main__":
    downloader = BatchDownloader("教程附件", max_workers=10)

    # Read the URL list from a text file, one URL per line.
    with open("urls.txt", "r", encoding="utf-8") as f:
        urls = [line.strip() for line in f if line.strip()]

    downloader.download_many(urls)

# 2. (this heading continues on the next line)
# Batch-download the images found in a web page
import os
import shutil
import zipfile
from datetime import datetime
from urllib.parse import urljoin

# Substrings that mark a URL as one of the common image formats.
_IMAGE_MARKERS = (".jpg", ".jpeg", ".png", ".gif", ".webp")


def download_images(page_url, save_dir="images"):
    """Download every common-format image referenced by ``<img>`` tags.

    Args:
        page_url: Address of the page to scan; relative ``src`` values are
            resolved against it.
        save_dir: Directory the images are saved into (created if missing).

    Returns:
        Whatever ``BatchDownloader.download_many`` returns for the image URLs.

    Raises:
        requests.HTTPError: If the page itself cannot be fetched.
    """
    # Third-party dependencies, imported where they are used so the archive
    # helpers in this section work without them installed.
    import requests
    from bs4 import BeautifulSoup

    os.makedirs(save_dir, exist_ok=True)

    # Fetch the page; fail loudly instead of parsing an error page.
    resp = requests.get(page_url, timeout=30)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    # Collect absolute image URLs, keeping only the common formats.
    candidates = []
    for img in soup.find_all("img"):
        src = img.get("src")
        if not src:
            continue
        full_url = urljoin(page_url, src)
        lowered = full_url.lower()
        if any(marker in lowered for marker in _IMAGE_MARKERS):
            candidates.append(full_url)

    # The same image is often referenced several times; downloading it once
    # avoids several threads writing the same file at the same moment.
    img_urls = list(dict.fromkeys(candidates))
    print(f"找到 {len(img_urls)} 张图片")

    downloader = BatchDownloader(save_dir)
    return downloader.download_many(img_urls)


# 3. Download one GitHub directory (without cloning the whole repository)
def download_github_dir(repo, dir_path, save_dir="github_files"):
    """Download the files directly inside one directory of a GitHub repo.

    Only entries whose ``type`` is ``"file"`` are fetched; sub-directories are
    not descended into.

    Args:
        repo: Repository in ``owner/name`` form.
        dir_path: Path of the directory inside the repository.
        save_dir: Local directory the files are saved into.

    Returns:
        Whatever ``BatchDownloader.download_many`` returns for the files.

    Raises:
        requests.HTTPError: If the contents API answers with an error status.
    """
    import requests  # third-party, see download_images

    api_url = f"https://api.github.com/repos/{repo}/contents/{dir_path}"
    resp = requests.get(api_url, timeout=30)
    # Without this an error payload (a JSON object, not a list) would be
    # iterated key by key and fail with an unrelated TypeError.
    resp.raise_for_status()

    entries = resp.json()
    if isinstance(entries, dict):
        # A JSON object instead of a list describes a single entry
        # (dir_path pointed at a file rather than a directory).
        entries = [entries]

    file_entries = [entry for entry in entries if entry["type"] == "file"]
    urls = [entry["download_url"] for entry in file_entries]
    names = [entry["name"] for entry in file_entries]

    downloader = BatchDownloader(save_dir)
    return downloader.download_many(urls, names)


# Part 2 - Automatic archiving
# Once the files are downloaded they usually have to be packed up before
# being passed on.
# 1. Zip a whole folder
def zip_folder(folder_path, output_name=None):
    """Zip the complete contents of ``folder_path``.

    Args:
        folder_path: Directory to archive.
        output_name: Archive path without the ``.zip`` suffix. Defaults to
            ``<folder name>_<YYYYMMDD>`` in the current directory.

    Returns:
        The archive path, i.e. ``output_name`` with ``.zip`` appended.

    Raises:
        NotADirectoryError: If ``folder_path`` is not an existing directory.
    """
    if not os.path.isdir(folder_path):
        raise NotADirectoryError(f"not a directory: {folder_path}")

    if not output_name:
        # normpath drops a trailing separator, which would otherwise make
        # basename() return an empty string.
        folder_name = os.path.basename(os.path.normpath(folder_path))
        today = datetime.now().strftime("%Y%m%d")
        output_name = f"{folder_name}_{today}"

    shutil.make_archive(output_name, "zip", folder_path)
    archive_path = f"{output_name}.zip"
    size_mb = os.path.getsize(archive_path) / 1024 / 1024
    print(f"已压缩: {output_name}.zip ({size_mb:.1f}MB)")
    return archive_path


# 2. Zip only files of particular types
def zip_selected_files(directory, extensions, output_name="selected"):
    """Zip only the files under ``directory`` that have a given suffix.

    Matching is case-insensitive, sub-directories are searched, and paths
    inside the archive are relative to ``directory``.

    Args:
        directory: Root directory to search.
        extensions: One suffix (``".pdf"``) or an iterable of suffixes.
        output_name: Archive path without the ``.zip`` suffix.

    Returns:
        The archive path, i.e. ``output_name`` with ``.zip`` appended.
    """
    if isinstance(extensions, str):
        # A bare string would otherwise be treated as a list of characters.
        extensions = [extensions]
    suffixes = tuple(ext.lower() for ext in extensions)

    archive_path = f"{output_name}.zip"
    archive_abs = os.path.abspath(archive_path)

    with zipfile.ZipFile(archive_path, "w", zipfile.ZIP_DEFLATED) as zf:
        for root, dirs, files in os.walk(directory):
            dirs.sort()  # deterministic traversal order
            for name in sorted(files):
                if not name.lower().endswith(suffixes):
                    continue
                filepath = os.path.join(root, name)
                if os.path.abspath(filepath) == archive_abs:
                    # The archive being written lives inside the directory
                    # being scanned; never add it to itself.
                    continue
                arcname = os.path.relpath(filepath, directory)
                zf.write(filepath, arcname)
                print(f"已添加: {arcname}")

    print(f"压缩完成: {output_name}.zip")
    return archive_path


# Usage
if __name__ == "__main__":
    download_images("https://example.com/article/1")
    download_github_dir("spring-projects/spring-boot", "starters")
    zip_folder("downloads", "课件包_20260630")
    # Pack only the PDF and Word files.
    zip_selected_files("课件", [".pdf", ".docx"], "课件PDF版")

# 3. (this heading continues on the next line)
# Password-protected archives
# Uses the pyminizip library: pip install pyminizip
import os
import shutil
import tempfile
import time
from datetime import datetime


def zip_with_password(folder_path, password, output_name="protected"):
    """Zip every file under ``folder_path`` into a password-protected archive.

    Args:
        folder_path: Directory whose files are archived (recursively).
        password: Password required to extract the archive.
        output_name: Archive path without the ``.zip`` suffix.

    Returns:
        The archive path, i.e. ``output_name`` with ``.zip`` appended.

    Raises:
        ValueError: If ``folder_path`` contains no files.
    """
    # Third-party dependency, imported where it is used so the rest of this
    # section works without it installed.
    import pyminizip

    # pyminizip archives an explicit list of files, each with the directory
    # prefix it should get inside the archive, so walk the tree ourselves.
    sources = []
    prefixes = []
    for root, dirs, files in os.walk(folder_path):
        dirs.sort()
        rel_dir = os.path.relpath(root, folder_path)
        prefix = "" if rel_dir == os.curdir else rel_dir.replace(os.sep, "/")
        for name in sorted(files):
            sources.append(os.path.join(root, name))
            prefixes.append(prefix)

    if not sources:
        raise ValueError(f"no files to compress in: {folder_path}")

    archive_path = f"{output_name}.zip"
    # 5 is the compression level.
    # NOTE(review): the encryption scheme is whatever pyminizip provides -
    # confirm it is strong enough before relying on it for sensitive data.
    pyminizip.compress_multiple(sources, prefixes, archive_path, password, 5)
    print(f"已加密压缩: {output_name}.zip")
    return archive_path


# Part 3 - Deleting the source files automatically (optional)
def clean_source(source_path):
    """Delete a file or a whole directory tree.

    Args:
        source_path: Path to remove.

    Returns:
        ``True`` if something was deleted, ``False`` if the path was neither
        a file, a link, nor a directory (nothing is deleted in that case).
    """
    # Links are unlinked rather than followed: shutil.rmtree refuses to
    # operate on a symbolic link to a directory.
    if os.path.isfile(source_path) or os.path.islink(source_path):
        os.remove(source_path)
        print(f"已删除: {source_path}")
        return True
    if os.path.isdir(source_path):
        shutil.rmtree(source_path)
        print(f"已删除目录: {source_path}")
        return True
    return False


# Part 4 - The complete pipeline
def download_and_pack(urls_file, output_name="打包文件"):
    """Download every URL listed in a file, zip the results, then clean up.

    Args:
        urls_file: Text file with one URL per line; blank lines are ignored.
        output_name: Archive path without the ``.zip`` suffix.

    Returns:
        The archive path returned by ``zip_folder``, or ``None`` when nothing
        could be downloaded.
    """
    # 1. Read the URLs.
    with open(urls_file, "r", encoding="utf-8") as f:
        urls = [line.strip() for line in f if line.strip()]
    print(f"共 {len(urls)} 个文件待下载")

    # A fresh scratch directory per run: leftovers from an earlier run can
    # never end up in the archive, and it is removed whatever happens below.
    base_dir = tempfile.mkdtemp(prefix="temp_download_")
    try:
        # 2. Batch download (10 threads).
        downloader = BatchDownloader(base_dir, max_workers=10)
        results = downloader.download_many(urls)

        # 3. Count what succeeded. download_many may only print a summary
        #    and return nothing; in that case count the files on disk.
        if results is None:
            success_count = len(os.listdir(base_dir))
        else:
            success_count = sum(1 for path in results if path)
        print(f"成功下载: {success_count}/{len(urls)}")

        if success_count > 0:
            # 4. Archive.
            return zip_folder(base_dir, output_name)
        return None
    finally:
        # 5. Remove the temporary files.
        shutil.rmtree(base_dir, ignore_errors=True)
        print("临时文件已清理")


# Part 5 - Running automatically on a schedule
def daily_download_task():
    """Fetch today's file list from the API, download it and zip the folder.

    Any failure is printed instead of raised: an exception escaping a
    scheduled job would propagate out of ``schedule.run_pending()`` and stop
    the scheduling loop for good.
    """
    import requests  # third-party, imported where it is used

    now = datetime.now()
    print(f"开始定时下载任务: {now}")
    try:
        # 1. Ask the API which files have to be downloaded today.
        resp = requests.get("https://api.example.com/today/files", timeout=30)
        resp.raise_for_status()
        files = resp.json()
        urls = [item["url"] for item in files]
        names = [item["name"] for item in files]

        # 2. Download.
        downloader = BatchDownloader("每日更新")
        downloader.download_many(urls, names)

        # 3. Archive. NOTE(review): the folder is never emptied, so each
        #    day's archive also contains earlier days' files - confirm that
        #    this is intended.
        zip_folder("每日更新", f"更新包_{now.strftime('%Y%m%d')}")
    except Exception as e:  # job boundary: keep the scheduler alive
        print(f"任务失败: {e}")
        return
    print("任务完成")


# Usage
if __name__ == "__main__":
    import schedule  # third-party scheduler, only needed when run as a script

    zip_with_password("机密文件", "mypassword123", "机密文件_加密")
    download_and_pack("urls.txt", "课程资料包_202607")

    # Run every day at 8 a.m.
    schedule.every().day.at("08:00").do(daily_download_task)
    while True:
        schedule.run_pending()
        time.sleep(60)

# Part 6 - Where this is useful in practice
#
#   Scenario                                  Command / code
#   Download a list of course attachments     downloader.download_many(url_list)
#   Pack up crawler output                    zip_folder("crawled_data")
#   Send a client an encrypted file           zip_with_password("report", "pwd123")
#   Automatic daily backup download           schedule.every().day.at("23:00").do(backup)
#
# Download methods compared
#
#   Method                     Speed      Best for
#   requests, single thread    slow       a handful of files
#   multi-threaded download    fast       batches of files (recommended)
#   wget command line          medium     Linux servers
#   aria2c                     very fast  large files, many files
#
# If this was useful, give it a like and follow 【张老师技术栈】 - hands-on
# Java / Python / web-scraping write-ups every week.