当前位置：首页 > news >正文

NLPCC 出版部分相关源码记录

news 2025/7/16 4:49:23

Download

Unzip

Author

Title

Affiliation

Check number of tex

Zip

Rename

Delete

Download

import requests
from bs4 import BeautifulSoup# 登录网站并获取登录后的 session
def login(username, password):
    """Log in to the site and return the session used for the request.

    The credentials are posted as form data to the login URL.

    Args:
        username: Value sent in the ``username`` form field.
        password: Value sent in the ``password`` form field.

    Returns:
        The ``requests.Session`` when the server answers with HTTP 200,
        otherwise ``None``.
    """
    login_url = 'https://example.com/login'
    # requests.session() is a deprecated alias kept only for backwards
    # compatibility; Session() is the supported constructor.
    session = requests.Session()
    login_data = {
        'username': username,
        'password': password,
        # other login parameters go here
    }
    response = session.post(login_url, data=login_data)
    # NOTE(review): only the HTTP status is inspected; presumably a rejected
    # login also yields a non-200 status on this site -- confirm.
    if response.status_code == 200:
        print("登录成功！")
        return session
    print("登录失败！")
    # Nobody will use this session, so release its connection pool.
    session.close()
    return None


# Collect the file links found on the file-list page
def get_file_links(session, file_list_url):
    """Fetch the file-list page and return the file links found on it.

    This is a template: the page is fetched and parsed, but the extraction
    step is left as a commented example, so the returned list is empty.

    Args:
        session: Object providing a requests-style ``get(url)``.
        file_list_url: URL of the page that lists the files.

    Returns:
        An empty list.
    """
    page = session.get(file_list_url)
    soup = BeautifulSoup(page.text, 'html.parser')
    # Parse the file-list page with BeautifulSoup to obtain the links,
    # e.g.: file_links = soup.find_all('a', class_='file-link')
    return []


# Download the files in bulk
def download_files(session, file_links, download_path):
    """Download every linked file into *download_path*.

    Args:
        session: Object providing a requests-style ``get(url, stream=True)``.
        file_links: Iterable of link objects exposing ``link['href']`` (the
            URL) and ``link.text`` (used, stripped, as the local file name).
        download_path: Target directory; created if it does not exist.
    """
    import os  # local import: this cell's import block does not provide os

    # Without this, open() below fails for a directory that was never created
    # (main() passes 'downloaded_files' without creating it).
    os.makedirs(download_path, exist_ok=True)
    for link in file_links:
        file_url = link['href']
        # NOTE(review): the file name comes straight from the remote page's
        # link text; presumably it never contains path separators -- verify.
        file_name = link.text.strip()
        # The context manager releases the streamed connection in all cases.
        with session.get(file_url, stream=True) as response:
            if response.status_code != 200:
                print(f"{file_name} 下载失败！")
                continue
            # Save the file locally, chunk by chunk
            with open(f"{download_path}/{file_name}", 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)
        print(f"{file_name} 下载成功！")


def main():
    """Log in, collect the file links and download them (template driver)."""
    username = 'your_username'
    password = 'your_password'
    file_list_url = 'https://example.com/files'  # URL of the file-list page
    download_path = 'downloaded_files'  # local download directory

    # Log in and obtain the authenticated session
    session = login(username, password)
    if session:
        # Collect the file links from the file-list page
        file_links = get_file_links(session, file_list_url)
        if file_links:
            # Download the files in bulk
            download_files(session, file_links, download_path)
        else:
            print("未找到文件链接！")
    else:
        print("登录失败，请检查用户名和密码！")


# if __name__ == "__main__":
#     main()

import requests
from bs4 import BeautifulSoup


def login(username, password):
    """Log in to the NLPCC 2023 softconf site and return the session.

    Args:
        username: Value sent in the ``username`` form field.
        password: Value sent in the ``password`` form field.

    Returns:
        The ``requests.Session`` when the server answers with HTTP 200,
        otherwise ``None``.
    """
    login_url = 'https://softconf.com/nlpcc/Main-2023/login/scmd.cgi?scmd=login'
    # requests.session() is a deprecated alias kept only for backwards
    # compatibility; Session() is the supported constructor.
    session = requests.Session()
    login_data = {"username": username, "password": password}
    response = session.post(login_url, data=login_data)
    # NOTE(review): only the HTTP status is inspected; presumably a rejected
    # login also yields a non-200 status on this site -- confirm.
    if response.status_code == 200:
        print("登录成功！")
        return session
    print("登录失败！")
    # Nobody will use this session, so release its connection pool.
    session.close()
    return None

# Placeholder credentials; replace them with a real account before running.
username = "用户名"
passwd = "密码"
# login() returns None when the server does not answer with HTTP 200.
session = login(username, passwd)

import re

# Paper ids to fetch; they are compared with the number that precedes "X-"
# in each link's passcode parameter.
ids = {
    214, 215, 220, 221, 222, 225, 229, 233, 235, 238, 239, 241, 246, 250, 251,
    252, 254, 256, 258, 260, 264, 271, 285, 292, 299, 301, 306, 307, 308,
}
file_list_url = "https://softconf.com/nlpcc/Main-2023/pub/scmd.cgi?scmd=manager&ToPage=monitorFinalSubmissions&FromPage=Main"
response = session.get(file_list_url)
soup = BeautifulSoup(response.text, 'html.parser')
table = soup.find('table', id='t1')
if table is None:
    # Fail with a readable message instead of an AttributeError on None.
    raise RuntimeError(
        "table with id 't1' not found on the final-submissions page; "
        "check that the login succeeded"
    )
links = table.find_all('a')
all_urls = [link.get('href') for link in links]

# (paper id, absolute URL) for every submission link whose id is in `ids`.
urls = []
for href in all_urls:
    if not href or not href.startswith('scmd.cgi?scmd=submitPaperCustom'):
        continue
    m = re.search(r"passcode=(\d+)X-.+", href)
    if m is None:
        continue
    paper_id = int(m.group(1))
    if paper_id in ids:
        urls.append((paper_id, "https://softconf.com/nlpcc/Main-2023/pub/" + href))

# Sanity check: True when one link was collected per requested id.
print(len(urls)==len(ids))
print(urls)

import time
import os
from tqdm.auto import tqdm


def download_files(session, urls: dict, paper_id: int):
    """Download every file in *urls* into ``./downloads/<paper_id>/``.

    Args:
        session: Object providing a requests-style ``get(url, stream=True)``.
        urls: Mapping of local file name -> URL to fetch.
        paper_id: Used as the sub-directory name and in failure messages.
    """
    save_dir = f"./downloads/{paper_id}/"
    # Loop-invariant, so the directory is created once up front.
    os.makedirs(save_dir, exist_ok=True)
    for file_name, file_url in urls.items():
        # The context manager releases the streamed connection in all cases.
        with session.get(file_url, stream=True) as response:
            if response.status_code != 200:
                print(f"{paper_id}_{file_name} 下载失败！")
                continue
            # Save the file locally, chunk by chunk
            with open(f"{save_dir}/{file_name}", 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)


def _first_href_with_suffix(links, suffix):
    """Return the first non-empty ``href`` among *links* ending with *suffix*.

    Raises:
        IndexError: If no link matches.
    """
    hrefs = [link.get('href') for link in links]
    return [href for href in hrefs if href and href.endswith(suffix)][0]


for paper_id, url in tqdm(urls):
    response = session.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    links = soup.find_all('a')
    pdf_url = _first_href_with_suffix(links, "fieldid=Final_Manuscript")
    zip_url = _first_href_with_suffix(links, "fieldid=Source_File")
    copyright_url = _first_href_with_suffix(links, "fieldid=CopyRight_Springer")
    downloads_urls = {"Final_Manuscript.pdf": pdf_url, "Source_File.zip": zip_url, "CopyRight.pdf": copyright_url}
    # NOTE(review): the full mapping above is immediately replaced, so only
    # CopyRight.pdf is fetched. Presumably a leftover from a partial re-run;
    # drop the next line to download all three files -- confirm.
    downloads_urls = {"CopyRight.pdf": copyright_url}
    try:
        download_files(session, downloads_urls, paper_id)
    except Exception as exc:
        # Best effort: report the failure and continue with the next paper
        # instead of hiding it behind a bare ``except: pass``.
        print(f"{paper_id} download error: {exc!r}")
    # Pause between papers.
    time.sleep(2)

Unzip

import zipfile
import os
import pathlib


def unzip_file(zip_filepath, dest_path):
    """Extract every member of the archive *zip_filepath* into *dest_path*.

    Args:
        zip_filepath: Path of the ``.zip`` archive to read.
        dest_path: Directory that receives the extracted files.
    """
    with zipfile.ZipFile(zip_filepath, mode='r') as archive:
        archive.extractall(path=dest_path)


# Usage
# Unpack Source_File.zip of every paper directory into <paper>/Source_File.
root_dir = pathlib.Path("./downloads/")
for paper_dir in root_dir.iterdir():
    archive, target = paper_dir / "Source_File.zip", paper_dir / "Source_File"
    try:
        unzip_file(archive, target)
    except Exception as e:
        # Report the error and the affected directory, then keep going.
        print(e)
        print(paper_dir)

import pathlib
import subprocess

# Compile Source_File/submission.tex of every paper with the sequence
# pdflatex -> bibtex -> pdflatex -> pdflatex, writing into outputs_tex/.
root_dir = pathlib.Path("./downloads/")
for directory in root_dir.iterdir():
    path_true = directory / "Source_File"
    if not (path_true / "submission.tex").exists():
        # Nothing to compile; report the directory, as before.
        print(directory)
        continue

    # Created only once the .tex file is known to exist, so a missing
    # Source_File directory no longer raises FileNotFoundError.
    dir_outputs_tex = pathlib.PurePosixPath("outputs_tex")
    (path_true / dir_outputs_tex).mkdir(exist_ok=True)

    pdflatex_cmd = [
        "pdflatex",
        f"-output-directory={dir_outputs_tex}",
        "-synctex=0",
        "-interaction=nonstopmode",
        "-file-line-error",
        "submission.tex",
    ]
    bibtex_cmd = ["bibtex", str(dir_outputs_tex / "submission.aux")]

    # Argument lists plus cwd= replace the IPython "! cd <dir> & <cmd>" lines:
    # no shell is involved, so this runs outside IPython, does not depend on
    # cmd.exe's "&" operator, and paths with spaces need no quoting.
    for command in (pdflatex_cmd, bibtex_cmd, pdflatex_cmd, pdflatex_cmd):
        try:
            completed = subprocess.run(
                command,
                cwd=str(path_true),
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                text=True,
                errors="replace",  # tool output is not guaranteed to be valid text
                check=False,  # failures are ignored, as in the original cell
            )
        except FileNotFoundError:
            print(f"{command[0]} not found on PATH; skipping {directory}")
            break
        # Echo the tool output so it stays visible in the notebook.
        print(completed.stdout, end="")

def compile2pdf(directory):
    """Compile ``Source_File/submission.tex`` under *directory*.

    Runs pdflatex, bibtex, pdflatex, pdflatex from inside ``Source_File``
    with ``outputs_tex`` as the output directory. When ``submission.tex``
    does not exist, *directory* is printed and nothing else happens.

    Args:
        directory: Paper directory (``str`` or path-like).
    """
    import subprocess  # local import: not provided by the file's import lines

    directory = pathlib.Path(directory)
    path_true = directory / "Source_File"
    if not (path_true / "submission.tex").exists():
        print(directory)
        return

    # Created only once the .tex file is known to exist, so a missing
    # Source_File directory no longer raises FileNotFoundError.
    dir_outputs_tex = pathlib.PurePosixPath("outputs_tex")
    path_tex = pathlib.PurePosixPath("submission.tex")
    path_aux = dir_outputs_tex / "submission.aux"
    (path_true / dir_outputs_tex).mkdir(exist_ok=True)

    pdflatex_cmd = [
        "pdflatex",
        f"-output-directory={dir_outputs_tex}",
        "-synctex=0",
        "-interaction=nonstopmode",
        "-file-line-error",
        str(path_tex),
    ]
    bibtex_cmd = ["bibtex", str(path_aux)]

    # Argument lists plus cwd= replace the IPython "! cd <dir> & <cmd>" lines:
    # no shell is involved, so this runs outside IPython, does not depend on
    # cmd.exe's "&" operator, and paths with spaces need no quoting.
    for command in (pdflatex_cmd, bibtex_cmd, pdflatex_cmd, pdflatex_cmd):
        try:
            completed = subprocess.run(
                command,
                cwd=str(path_true),
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                text=True,
                errors="replace",  # tool output is not guaranteed to be valid text
                check=False,  # failures are ignored, as in the original cell
            )
        except FileNotFoundError:
            print(f"{command[0]} not found on PATH; skipping {directory}")
            return
        # Echo the tool output so it stays visible in the notebook.
        print(completed.stdout, end="")


compile2pdf("downloads/306")

def is_same_file(file1, file2):
    """Return True when the two files have byte-identical content.

    The files are compared chunk by chunk, so neither is loaded into memory
    as a whole and the comparison stops at the first difference.
    """
    chunk_size = 1 << 16
    with open(file1, 'rb') as f1, open(file2, 'rb') as f2:
        while True:
            block1 = f1.read(chunk_size)
            block2 = f2.read(chunk_size)
            if block1 != block2:
                return False
            if not block1:  # both files ended at the same point
                return True


import PyPDF2
from PyPDF2 import PdfReader


def extract_text_from_pdf(file_path):
    """Return ``(text, page_count)`` for the PDF at *file_path*.

    ``text`` is the concatenation of ``extract_text()`` over all pages.
    """
    with open(file_path, 'rb') as file:
        pages = PdfReader(file).pages
        # join() builds the result in one pass instead of repeated "+=".
        text = "".join(page.extract_text() for page in pages)
        return text, len(pages)


def compare_pdfs(file_path1, file_path2):
    """Compare the extracted text of two PDFs.

    Returns:
        ``(same_text, pages_1, pages_2)``: whether the extracted texts are
        equal, followed by the page count of each file.
    """
    text1, n_1 = extract_text_from_pdf(file_path1)
    text2, n_2 = extract_text_from_pdf(file_path2)
    return text1 == text2, n_1, n_2


root_dir = pathlib.Path("./downloads/")
# Compare each camera-ready PDF with the PDF compiled from its sources.
for paper_dir in root_dir.iterdir():
    camera_ready = paper_dir / "Final_Manuscript.pdf"
    compiled = paper_dir / "Source_File" / "outputs_tex" / "submission.pdf"
    try:
        ok, n1, n2 = compare_pdfs(camera_ready, compiled)
    except Exception as e:
        print(e)
        print(f"Fail to compare: {paper_dir}")
    else:
        if not ok:
            # Texts differ: also show both page counts.
            print(f"Not same: {paper_dir}")
            print(n1, n2, sep='    ')
    print("=========================================================================")

Author

import re


def extract_author(tex_file_path):
    r"""Return the argument of the first line-initial ``\author{...}``.

    The match ends at the first closing brace that is followed by whitespace
    and a ``%``; surrounding whitespace inside the braces is dropped.

    Args:
        tex_file_path: Path of a UTF-8 encoded ``.tex`` file.

    Returns:
        The matched text, or ``""`` when nothing matches.
    """
    with open(tex_file_path, 'r', encoding='utf-8') as handle:
        source = handle.read()
    author_re = re.compile(r"^\\author{\s*(.*?)\s*}\s+\%", re.DOTALL | re.MULTILINE)
    found = author_re.search(source)
    return found.group(1) if found else ""


tex_file_path = "downloads\\215\\Source_File\\submission.tex"  # Replace with the path to your .tex file
author = extract_author(tex_file_path)

# One entry per directory under ./downloads/; "" when submission.tex is missing.
authors = []
root_dir = pathlib.Path("./downloads/")
for paper_dir in root_dir.iterdir():
    tex_file_path = paper_dir / "Source_File" / "submission.tex"
    print(f"------{paper_dir}---------")
    if not tex_file_path.exists():
        print(f"Fail to open tex: {tex_file_path}")
        authors.append("")
    else:
        author = extract_author(tex_file_path)
        # Disabled clean-up steps from the original cell: strip LaTeX commands
        # and punctuation, join lines with commas, drop a trailing comma, and
        # turn the last comma into "and".
        authors.append(author)
        print(author)
    print('====================================================================')

import pandas as pd

# Destination workbook for the extracted author strings
file_path = "./author.xlsx"
# Turn the list into a single-column DataFrame and write it without the index
df = pd.DataFrame(data=authors, columns=["author"])
df.to_excel(file_path, index=False)

Title

import re


def extract_title(tex_file_path):
    r"""Return the argument of the first line-initial ``\title{...}``.

    The match ends at the first closing brace that is followed by whitespace
    and a ``%``; surrounding whitespace inside the braces is dropped.

    Args:
        tex_file_path: Path of a UTF-8 encoded ``.tex`` file.

    Returns:
        The matched text, or ``""`` when nothing matches.
    """
    with open(tex_file_path, 'r', encoding='utf-8') as handle:
        source = handle.read()
    title_re = re.compile(r"^\\title{\s*(.*?)\s*}\s+\%", re.DOTALL | re.MULTILINE)
    found = title_re.search(source)
    return found.group(1) if found else ""


# tex_file_path = "downloads\\215\\Source_File\\submission.tex"  # Replace with the path to your .tex file
# author = extract_author(tex_file_path)

# Start from an empty list. The collapsed source fused this reset into the
# comment above, so titles would have been appended to whatever `authors`
# already held. The names `author`/`authors` hold titles in this cell; they
# are kept because the export step reads `authors`.
authors = []
root_dir = pathlib.Path("./downloads/")
for directory in root_dir.iterdir():
    tex_file_path = directory/"Source_File"/"submission.tex"
    print(f"------{directory}---------")
    if tex_file_path.exists():
        author = extract_title(tex_file_path)
        # Replace LaTeX line breaks (\\) and the whitespace around them by a space.
        author = re.sub(r"\s*\\\\\s*", " ", author)
        # Drop everything from a remaining backslash to the end of its line.
        author = re.sub(r"\\.*", "", author)
        authors.append(author)
        print(author)
    else:
        print(f"Fail to open tex: {tex_file_path}")
        # Keep one entry per directory.
        authors.append("")
    print('====================================================================')

import pandas as pd

# Destination workbook for the extracted titles
file_path = "./title.xlsx"
# Turn the list into a single-column DataFrame and write it without the index
df = pd.DataFrame(data=authors, columns=["title"])
df.to_excel(file_path, index=False)

Affiliation

import re


def extract_affiliation(tex_file_path):
    r"""Return the argument of the first line-initial ``\institute{...}``.

    The match ends at the first closing brace that is followed by whitespace
    and a ``%``; surrounding whitespace inside the braces is dropped.

    Args:
        tex_file_path: Path of a UTF-8 encoded ``.tex`` file.

    Returns:
        The matched text, or ``""`` when nothing matches.
    """
    with open(tex_file_path, 'r', encoding='utf-8') as handle:
        source = handle.read()
    institute_re = re.compile(r"^\\institute{\s*(.*?)\s*}\s+\%", re.DOTALL | re.MULTILINE)
    found = institute_re.search(source)
    return found.group(1) if found else ""


# tex_file_path = "downloads\\215\\Source_File\\submission.tex"  # Replace with the path to your .tex file
# author = extract_author(tex_file_path)

# Start from an empty list. The collapsed source fused this reset into the
# comment above, so affiliations would have been appended to whatever
# `authors` already held. The names `author`/`authors` hold affiliations in
# this cell; they are kept because the export step reads `authors`.
authors = []
root_dir = pathlib.Path("./downloads/")
# The printed counter starts at 2.
# NOTE(review): presumably the spreadsheet row (row 1 = header) -- confirm.
for i, directory in enumerate(root_dir.iterdir(), start=2):
    tex_file_path = directory/"Source_File"/"submission.tex"
    print(f"------{i} {directory}---------")
    if tex_file_path.exists():
        author = extract_affiliation(tex_file_path)
        authors.append(author)
        print(author)
    else:
        print(f"Fail to open tex: {tex_file_path}")
        # Keep one entry per directory.
        authors.append("")
    print('====================================================================')

import pandas as pd

# Destination workbook for the extracted affiliations
file_path = "./affiliation.xlsx"
# Turn the list into a single-column DataFrame and write it without the index
df = pd.DataFrame(data=authors, columns=["affiliation"])
df.to_excel(file_path, index=False)

Check number of tex

import pathlib
root_dir = pathlib.Path("./downloads/")


def num_tex(dirctory: pathlib.Path):
    """Count the direct children of *dirctory* whose suffix is ``.tex``."""
    return sum(1 for entry in dirctory.iterdir() if entry.suffix == '.tex')


# Print every paper directory whose Source_File holds more than one .tex file.
for paper_dir in root_dir.iterdir():
    if num_tex(paper_dir / "Source_File") > 1:
        print(paper_dir)

Zip

import os
import zipfile


def zip_directory(directory_path, zip_path):
    """Compress a directory tree into a zip file.

    Files are stored under their path relative to *directory_path*. If the
    archive itself lies inside *directory_path*, it is skipped so that it is
    not added to itself.

    Args:
        directory_path: Directory to compress.
        zip_path: Where to write the zip file.
    """
    # Only a path-like target can collide with a walked file; a file object
    # passed as zip_path has no path to compare.
    if isinstance(zip_path, (str, bytes, os.PathLike)):
        archive_abs = os.path.abspath(os.fsdecode(zip_path))
    else:
        archive_abs = None

    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, _, files in os.walk(directory_path):
            for file in files:
                file_path = os.path.join(root, file)
                if archive_abs is not None and os.path.abspath(os.fsdecode(file_path)) == archive_abs:
                    continue  # the archive being written
                zipf.write(file_path, os.path.relpath(file_path, directory_path))


# Example usage:
# directory_to_compress = '/path/to/source_directory'
# zip_file_path = '/path/to/destination.zip'
# zip_directory(directory_to_compress, zip_file_path)

Rename

import pathlib
root_dir = pathlib.Path("./downloads/")

# For every paper directory: pack Source_File into source.zip and give the
# two PDFs their final names.
for d in list(root_dir.iterdir()):
    src = d/"Source_File"
    zip_directory(src, src.parent/"source.zip")

    submi = d/"Final_Manuscript.pdf"
    submi.rename(submi.with_name("submission.pdf"))

    cprt = d/"CopyRight.pdf"
    # Rename once. The original nested a second rename of the old path, which
    # no longer exists after the first call on a case-sensitive filesystem.
    cprt.rename(cprt.with_name("copyright.pdf"))

Delete

import pathlib
import shutil
import os
root_dir = pathlib.Path("./downloads/")

# Remove the downloaded Source_File.zip from every paper directory.
for paper_dir in list(root_dir.iterdir()):
    os.remove(paper_dir / "Source_File.zip")

NLPCC 出版部分相关源码记录

目录 Download Unzip Author Title Affiliation Check number of tex Zip Rename Delete Download import requests from bs4 import BeautifulSoup# 登录网站并获取登录后的 session def login(username, password):login_url https://example.com/loginsession re…...

编程日记 2023/8/9 11:21:25

【Windbg】通过网络调试windows内核

环境 windows版本：win10_x64 1901 windbg版本：1.2306.12001.0 HOST 1、windbg软件设置。点击菜单文件，然后如下图操作。 2、等待连接。 ************* Waiting for Debugger Extensions Gallery to Initialize **************>>&…...

编程日记 2023/8/9 11:20:24

代码随想录算法训练营之JAVA｜第二十四天| 93. 复原 IP 地址

今天是第24天刷leetcode，立个flag，打卡60天。算法挑战链接 93. 复原 IP 地址https://leetcode.cn/problems/restore-ip-addresses/ 第一想法题目理解：将一串数字字符串变成正确的ip格式的字符串。这类题目是切分字符串，ip一…...

编程日记 2023/8/9 11:19:23

网络安全 Day30-运维安全项目-堡垒机部署

运维安全项目-堡垒机部署 1. 运维安全项目-架构概述2. 运维安全项目之堡垒机2.1 堡垒机概述2.2 堡垒机选型2.3 环境准备2.4 部署Teleport堡垒机2.4.1 下载与部署2.4.2 启动2.4.3 浏览器访问teleport2.4.4 进行配置2.4.5 安装teleport客户端 2.5 teleport连接服务器 1. 运维安全…...

编程日记 2023/8/9 11:18:21

电脑文件夹备份命令

电脑文件夹备份 cmd窗口输入shell:startup 将备份.bat文件放到，自启动文件夹下 bat文件内容写以下就可以了 Xcopy "D:\文件\" "F:\文件备份\" /E/H/C/I/y...

编程日记 2023/8/9 11:17:20

RocketMQ Learning(一)

目录一、RocketMQ 0、RocketMQ的产品发展 1、RocketMQ安装 1.1、windows下的安装注意事项 1.2、Linux下的安装 1.3、源码的安装 1.4、控制台 2、消息发送方式 2.1、发送同步消息 2.2、发送异步消息 2.3、单向发送 3、消息消费方式 3.1、负载均衡模式…...

编程日记 2023/8/9 11:16:18

libmpv使用滤镜处理视频进行播放

一、前言作为一个功能强大的多媒体框架，libmpv为开发者提供了广泛的功能和灵活的控制权。滤镜是libmpv的一个重要特性，允许开发者对视频进行各种实时处理和增强，从而满足用户对于个性化、创意化和高质量视频体验的需求。滤镜是一种在视频渲染过程中应用特定效果的技术。…...

编程日记 2023/8/9 11:15:17

Harbor.cfg 配置文件参数详解

目录 Harbor.cfg 配置文件参数详解所需参数： hostname： ui_url_protocol： max_job_workers： db_password： customize_crt： ssl_cert： ssl_cert_key： secretkey_path…...

编程日记 2023/8/9 11:14:16

模仿火星科技基于cesium+ 贴地测量+可编辑

当您进入Cesium的编辑贴地测量世界，下面是一个详细的操作过程，帮助您顺利使用这些功能： 1. 创建提示窗： 启动Cesium应用，地图场景将打开，欢迎您进入编辑模式。在屏幕的一角，一个友好的提示窗将…...

编程日记 2023/8/9 11:13:12

模仿火星科技基于cesium+角度测量+高度测量+可编辑

1. 创建提示窗： 启动Cesium应用，地图场景将打开，欢迎您进入编辑模式。在屏幕的一角，一个友好的提示窗将呈现，随着您的操作，它会为您提供有用的信息和指导。 2. 绘制面积： 轻轻点击鼠标左键…...

编程日记 2023/8/9 11:12:11

Codeforces の动态规划

Codeforces Round 785 (Div. 2) - C. Palindrome Basis dp(9/100) 题目链接思路：整数划分基础上加一个判断回文的条件整数划分思路：背包容量为n，物品有体积为1~n n种，每种无数个，求使背包恰好装满的方案数——完全背…...

编程日记 2023/8/9 11:11:10

数学建模-爬虫系统学习

尚硅谷Python爬虫教程小白零基础速通（含python基础爬虫案例） 内容包括：Python基础、Urllib、解析（xpath、jsonpath、beautiful）、requests、selenium、Scrapy框架 python基础进阶（字符串列表元组字典…...

编程日记 2023/8/9 11:10:09

HarmonyOS/OpenHarmony应用开发-ArkTS语言渲染控制概述

ArkUI通过自定义组件的build()函数和builder装饰器中的声明式UI描述语句构建相应的UI。在声明式描述语句中开发者除了使用系统组件外，还可以使用渲染控制语句来辅助UI的构建，这些渲染控制语句包括控制组件是否显示的条件渲染语句，基于数组数…...

编程日记 2023/8/9 11:09:08

【力扣刷题 | 第二十五天】

目录前言： 474. 一和零 - 力扣（LeetCode） 总结: 前言： 今天我们依旧暴打动态规划 474. 一和零 - 力扣（LeetCode） 给你一个二进制字符串数组 strs 和两个整数 m 和 n 。请你找出并返回 strs 的最大子集…...

编程日记 2023/8/9 11:08:07

GO学习之函数(Function)

GO系列 1、GO学习之Hello World 2、GO学习之入门语法 3、GO学习之切片操作 4、GO学习之 Map 操作 5、GO学习之结构体操作 6、GO学习之通道(Channel) 7、GO学习之多线程(goroutine) 8、GO学习之函数(Function) 9、GO学习之接口(Interface) 文章目录 GO系列前言一、什么是…...

编程日记 2023/8/9 11:07:06

Jstack线上问题排查

1.top查找出哪个进程消耗的cpu高。执行top命令，默认是进程视图，其中PID是进程号（记下进程号） 2.top中按 shift+h 或“H”查找出哪个线程消耗的cpu高 （记下最高的几个线程号） jstack 进程号 >> pid-cpu.…...

编程日记 2023/8/9 11:06:03

VIM 编辑器： Bram Moolenaar

VIM 用了很长时间， 个人的 VIM 配置文件差不多10年没有更新了。以前写程序的时候， 编辑都用这个。 linux kernel， boost规模的代码都不在话下。现在虽然代码写的少了，依然是我打开文件的首选。现在用手机了，配个蓝牙键…...

编程日记 2023/8/9 11:05:03

鸿蒙应用开发指南：从零开始构建一款智能音乐播放器

介绍随着鸿蒙操作系统的发布，开发者们迫不及待地想要探索鸿蒙应用的开发。本篇博客将以构建一款智能音乐播放器为例，带你一步步了解鸿蒙应用开发的技术要点和实践。我们将使用HarmonyOS的开发环境和MarkDown进行排版，方便你快速上手。准备…...

编程日记 2023/8/9 11:04:02

如何实现对主机的立体监控？

主机监控是保证系统稳定性和性能的重要环节之一，那应该如何实现对主机的立体监控？ 本期EasyOps产品使用最佳实践，我们将为您揭晓： 主机应该如何分组和管理？ 主机监控应该关注哪些关键性指标？ 背景通…...

编程日记 2023/8/9 11:03:00

机器学习笔记：李宏毅ChatGPT Finetune VS Prompt

1 两种大语言模型：GPT VS BERT 2 对于大语言模型的两种不同期待 2.1 “专才” 2.1.1 成为专才的好处 Is ChatGPT A Good Translator? A Preliminary Study 2023 Arxiv 箭头方向指的是从哪个方向往哪个方向翻译表格里面的数值越大表示翻译的越好可以发现专门做翻…...

编程日记 2023/8/9 11:01:59

eNSP-Cloud(实现本地电脑与eNSP内设备之间通信)

说明： 想象一下，你正在用eNSP搭建一个虚拟的网络世界，里面有虚拟的路由器、交换机、电脑（PC）等等。这些设备都在你的电脑里面“运行”，它们之间可以互相通信，就像一个封闭的小王国。但是…...

编程新知 2025/7/15 21:38:30

谷歌浏览器插件

项目中有时候会用到插件 sync-cookie-extension1.0.0：开发环境同步测试 cookie 至 localhost，便于本地请求服务携带 cookie 参考地址：https://juejin.cn/post/7139354571712757767 里面有源码下载下来，加在到扩展即可使用FeHelp…...

编程新知 2025/6/25 22:42:56

Java 语言特性(面试系列2)

一、SQL 基础 1. 复杂查询 （1）连接查询（JOIN） 内连接（INNER JOIN）：返回两表匹配的记录。 SELECT e.name, d.dept_name FROM employees e INNER JOIN departments d ON e.dept_id = d.dept_id; 左…...

编程新知 2025/7/12 7:16:15

智慧工地云平台源码，基于微服务架构+Java+Spring Cloud +UniApp +MySql

智慧工地管理云平台系统，智慧工地全套源码，java版智慧工地源码，支持PC端、大屏端、移动端。智慧工地聚焦建筑行业的市场需求，提供“平台+网络+终端”的整体解决方案，提供劳务管理、视频管理、智能监测、绿色施工、安全管…...

编程新知 2025/7/16 0:27:38

FFmpeg 低延迟同屏方案

引言在实时互动需求激增的当下，无论是在线教育中的师生同屏演示、远程办公的屏幕共享协作，还是游戏直播的画面实时传输，低延迟同屏已成为保障用户体验的核心指标。FFmpeg 作为一款功能强大的多媒体框架，凭借其灵活的编解码、数据…...

编程新知 2025/6/21 15:38:27

dedecms 织梦自定义表单留言增加ajax验证码功能

增加ajax功能模块，用户不点击提交按钮，只要输入框失去焦点，就会提前提示验证码是否正确。一，模板上增加验证码 <input name="vdcode" id="vdcode" placeholder="请输入验证码" type="text"…...

编程新知 2025/7/14 18:25:26

Android Bitmap治理全解析：从加载优化到泄漏防控的全生命周期管理

引言 Bitmap（位图）是Android应用内存占用的“头号杀手”。一张1080P（1920x1080）的图片以ARGB_8888格式加载时，内存占用高达8MB（1920×1080×4 字节）。据统计，超过60%的应用OOM崩溃与Bitm…...

编程新知 2025/6/22 8:50:50

laravel8+vue3.0+element-plus搭建方法

创建 laravel8 项目 composer create-project --prefer-dist laravel/laravel laravel8 8.* 安装 laravel/ui composer require laravel/ui 修改 package.json 文件 "devDependencies": {"vue/compiler-sfc": "^3.0.7","axios": …...

编程新知 2025/7/9 16:47:32