|
|
import pandas as pd
|
|
|
import os
|
|
|
import sys
|
|
|
|
|
|
|
|
|
def process_single_file(reference_file_path, file_path, output_folder):
    """
    Re-header and clean one Excel file, then save it into ``output_folder``.

    The column labels of ``file_path`` are replaced by those of the reference
    workbook, a trailing data-source footer row is dropped, and the
    '首发上市日' column (when present) is reduced to its date part. The result
    is written to ``output_folder`` under the input file's base name.

    :param reference_file_path: path of the workbook whose header is reused
    :param file_path: path of the workbook to process
    :param output_folder: directory that receives the processed workbook
    :raises ValueError: if the two workbooks differ in column count, or if a
        non-numeric '首发上市日' value cannot be parsed as a date
    """
    footer_marker = '数据来源:东方财富Choice数据'
    listing_date_column = '首发上市日'

    # Only the header of the reference workbook is needed.
    header = pd.read_excel(reference_file_path, parse_dates=False).columns

    frame = pd.read_excel(file_path, parse_dates=False)

    # Fail with an explicit message instead of pandas' generic
    # "Length mismatch" error when the layouts differ.
    if len(frame.columns) != len(header):
        raise ValueError(
            f"Column count mismatch: {file_path!r} has {len(frame.columns)} "
            f"column(s) but reference {reference_file_path!r} has {len(header)}."
        )
    frame.columns = header

    # Drop the trailing footer row if its first cell holds the marker text.
    # Copy so the later column assignment does not write into a slice of the
    # original frame.
    if not frame.empty and frame.iloc[-1, 0] == footer_marker:
        frame = frame.iloc[:-1].copy()

    # Keep only the date part of the listing-date column.
    if listing_date_column in frame.columns:
        listing_dates = frame[listing_date_column]
        # Numeric columns are left untouched: to_datetime would interpret the
        # numbers as epoch offsets, which is not what a spreadsheet means.
        if not pd.api.types.is_numeric_dtype(listing_dates):
            # to_datetime accepts both real datetimes and text dates, so the
            # .dt accessor is always valid afterwards.
            frame[listing_date_column] = pd.to_datetime(listing_dates).dt.normalize()

    # NOTE(review): the output keeps the input's extension; for '.xls' inputs
    # this relies on the installed pandas having a writer for the legacy
    # format — confirm.
    output_file_path = os.path.join(output_folder, os.path.basename(file_path))
    frame.to_excel(output_file_path, index=False)
|
|
|
|
|
|
|
|
|
def batch_process_files(reference_file_path, input_folder, output_folder):
    """
    Process every matching Excel workbook found directly in ``input_folder``.

    A directory entry is processed when it is a regular file, its name
    contains '动量原始股(全部A股)', and it ends in '.xlsx' or '.xls' (any letter
    case). Entries whose name starts with '~$' are skipped. Each match is
    passed to ``process_single_file`` and a confirmation line is printed.

    :param reference_file_path: path of the reference workbook providing the header
    :param input_folder: directory containing the workbooks to process
    :param output_folder: directory receiving the processed workbooks;
        created if it does not exist
    """
    os.makedirs(output_folder, exist_ok=True)

    name_marker = '动量原始股(全部A股)'
    excel_suffixes = ('.xlsx', '.xls')

    # os.listdir returns entries in arbitrary order; sort for reproducible runs.
    for file_name in sorted(os.listdir(input_folder)):
        # Excel keeps a '~$<name>' lock file next to an open workbook. It has
        # the same extension but is not a readable workbook.
        if file_name.startswith('~$'):
            continue
        if name_marker not in file_name:
            continue
        # Compare the extension case-insensitively so '.XLSX' is accepted too.
        if not file_name.lower().endswith(excel_suffixes):
            continue

        file_path = os.path.join(input_folder, file_name)
        if not os.path.isfile(file_path):
            continue

        process_single_file(reference_file_path, file_path, output_folder)
        print(f"Processed file: {file_name} successfully.")
|
|
|
|
|
|
# Example command-line arguments (reference file, input folder, output folder):
# C:\Users\winds\Desktop\batch_process_xls\model.xlsx C:\Users\winds\Desktop\batch_process_xls\inputfile C:\Users\winds\Desktop\batch_process_xls\outputfile
|
|
|
if __name__ == "__main__":
    # The script name plus exactly three positional arguments are required.
    if len(sys.argv) != 4:
        print("Usage: python batch_process_excel.py reference_file_path input_folder output_folder")
        sys.exit(1)

    # Positional order: reference workbook, input directory, output directory.
    reference_file_path, input_folder, output_folder = sys.argv[1:]

    batch_process_files(reference_file_path, input_folder, output_folder)
    print("Batch processing completed successfully.")