"""Export the scoring tables of '军事体育五项标准.pdf' to Excel workbooks.

Each table is read with pdfplumber from a fixed set of page indices, its
cells are converted to numbers, and the result is written to one ``.xlsx``
file per table in the current working directory.

Run as a script to perform the conversion; importing the module only
defines the helpers.
"""

from typing import Callable, List, NamedTuple, Sequence, Tuple

import numpy as np
import pandas as pd

PDF_PATH = '军事体育五项标准.pdf'
# NOTE(review): the password is hard-coded; move it to configuration if the
# document is sensitive.
PDF_PASSWORD = '456456'

# Number of leading data columns of the women's arm-hang/push-up table that
# hold times; the remaining columns hold repetition counts.
TIMED_COLUMN_COUNT = 6


class TableLayout(NamedTuple):
    """Describe where the header labels and data rows sit in a table family."""

    # Header labels are taken from the first extracted row, starting here.
    header_offset: int
    # Label that replaces the first header label.
    first_column: str
    # A row is dropped when any of its cells equals one of these strings
    # (this removes repeated header rows and footnote rows).
    skip_markers: Tuple[str, ...]
    # Numeric type the first cell of every data row is cast to (row index).
    index_type: type


class TableSpec(NamedTuple):
    """Describe one table to export."""

    # Name of the workbook that is written.
    output: str
    # Zero-based indices into ``pdf.pages`` whose tables are concatenated.
    pages: Tuple[int, ...]
    layout: TableLayout
    # Converts the cells of one data row (first cell excluded) to numbers.
    parse_row: Callable[[Sequence[str]], list]


BODY_SHAPE_LAYOUT = TableLayout(2, '0~24岁', ('公式', '说明', '24岁以下'), float)
SCORE_LAYOUT = TableLayout(1, "0~24", ('25~27', '备注'), int)


def parse_range(cell: str) -> List[float]:
    """Parse a ``"low~high"`` cell into a list of floats, ignoring spaces."""
    return [float(part) for part in cell.replace(' ', '').split("~")]


def parse_count(cell: str) -> int:
    """Parse an integer cell, ignoring spaces inside the number."""
    return int(cell.replace(' ', ''))


def parse_seconds_tenths(cell: str) -> float:
    """Parse ``<seconds>″<fraction>`` into seconds as a float.

    The fraction is read as decimal digits, so ``17″5`` is 17.5 and
    ``17″25`` is 17.25; an empty fraction counts as zero.

    Raises:
        ValueError: if the cell has no ``″`` separator or a part is not an
            integer.
    """
    parts = cell.split("″")
    if len(parts) < 2:
        raise ValueError(f"expected a '″' separator in {cell!r}")
    whole = int(parts[0])
    fraction = parts[1].strip()
    if not fraction:
        return float(whole)
    # Divide by the matching power of ten instead of multiplying by 0.1:
    # exact for any number of digits and free of 0.1 rounding error.
    return whole + int(fraction) / 10 ** len(fraction)


def parse_minutes_seconds(cell: str, *, allow_seconds_only: bool = False) -> int:
    """Parse ``<min>′<sec>″`` into a whole number of seconds.

    Args:
        cell: Text such as ``13′40″``.
        allow_seconds_only: Also accept ``<sec>″`` cells without a minute
            part.

    Raises:
        ValueError: if the minute separator is missing and
            ``allow_seconds_only`` is false, or a part is not an integer.
    """
    parts = cell.split("′")
    if len(parts) > 1:
        # [:-1] drops the trailing seconds mark.
        return int(parts[0]) * 60 + int(parts[1][:-1])
    if allow_seconds_only:
        return int(cell[:-1])
    raise ValueError(f"expected a '′' separator in {cell!r}")


def uniform_row(parse_cell: Callable[[str], object]) -> Callable[[Sequence[str]], list]:
    """Return a row parser that applies *parse_cell* to every cell."""

    def parse_row(cells: Sequence[str]) -> list:
        return [parse_cell(cell) for cell in cells]

    return parse_row


def parse_timed_then_count_row(cells: Sequence[str]) -> List[int]:
    """Parse a row whose first ``TIMED_COLUMN_COUNT`` cells are times.

    The timed cells may be ``<min>′<sec>″`` or ``<sec>″``; all remaining
    cells are integer counts.
    """
    timed = [
        parse_minutes_seconds(cell, allow_seconds_only=True)
        for cell in cells[:TIMED_COLUMN_COUNT]
    ]
    counts = [parse_count(cell) for cell in cells[TIMED_COLUMN_COUNT:]]
    return timed + counts


# Tables in export order. Page index 4 is not used by any table.
TABLES: Tuple[TableSpec, ...] = (
    TableSpec('manBodyShape.xlsx', (0, 1), BODY_SHAPE_LAYOUT,
              uniform_row(parse_range)),
    TableSpec('womanBodyShape.xlsx', (2, 3), BODY_SHAPE_LAYOUT,
              uniform_row(parse_range)),
    TableSpec('manBodyChinUpsPushUps.xlsx', (5,), SCORE_LAYOUT,
              uniform_row(parse_count)),
    TableSpec('manSitUps.xlsx', (6,), SCORE_LAYOUT,
              uniform_row(parse_count)),
    TableSpec('man60mSerpentineRun.xlsx', (7,), SCORE_LAYOUT,
              uniform_row(parse_seconds_tenths)),
    TableSpec('man3000mRun.xlsx', (8,), SCORE_LAYOUT,
              uniform_row(parse_minutes_seconds)),
    TableSpec('womanBodyArmDrapePushUps.xlsx', (9,), SCORE_LAYOUT,
              parse_timed_then_count_row),
    TableSpec('womanSitUps.xlsx', (10,), SCORE_LAYOUT,
              uniform_row(parse_count)),
    TableSpec('woman60mSerpentineRun.xlsx', (11,), SCORE_LAYOUT,
              uniform_row(parse_seconds_tenths)),
    TableSpec('woman3000mRun.xlsx', (12,), SCORE_LAYOUT,
              uniform_row(parse_minutes_seconds)),
)


def collect_rows(pdf, page_numbers: Sequence[int]) -> List[List[str]]:
    """Return the table rows of the given pages with empty cells removed.

    Args:
        pdf: An opened pdfplumber document.
        page_numbers: Zero-based indices into ``pdf.pages``.

    Raises:
        ValueError: if a page yields no table.
    """
    rows: List[List[str]] = []
    for page_number in page_numbers:
        table = pdf.pages[page_number].extract_table()
        if table is None:
            raise ValueError(f"no table found on page index {page_number}")
        # Drop None/empty cells so every data row collapses to its values.
        rows.extend([cell for cell in row if cell] for row in table)
    return rows


def rows_to_frame(
    rows: Sequence[Sequence[str]],
    layout: TableLayout,
    parse_row: Callable[[Sequence[str]], list],
) -> pd.DataFrame:
    """Build the DataFrame for one table from its extracted rows.

    Args:
        rows: Extracted rows; the first one supplies the column labels.
        layout: Header position, skip markers and index type of the table.
        parse_row: Converts the cells after the first one of a data row.

    Returns:
        A frame indexed by the first cell of every data row, with one column
        per header label.

    Raises:
        ValueError: if there are no rows, no column labels, no data rows, or
            the data rows differ in length.
    """
    if not rows:
        raise ValueError("no table rows to convert")
    column_names = list(rows[0][layout.header_offset:])
    if not column_names:
        raise ValueError("header row has no column labels")
    column_names[0] = layout.first_column

    body = [
        row for row in rows
        if not any(marker in row for marker in layout.skip_markers)
    ]
    if not body:
        raise ValueError("table has no data rows")
    if len({len(row) for row in body}) != 1:
        raise ValueError("data rows have differing numbers of cells")

    index = np.array(
        [row[0].replace(' ', '') for row in body]
    ).astype(layout.index_type)
    data = np.array([parse_row(row[1:]) for row in body])

    frame = pd.DataFrame(columns=column_names, index=index)
    for position, name in enumerate(column_names):
        # list() keeps one object per row; for range tables each object is a
        # small array holding the parsed bounds.
        frame[name] = list(data[:, position])
    return frame


def main() -> None:
    """Read every configured table from the PDF and write its workbook."""
    # Imported here so the parsing helpers above can be imported without
    # pdfplumber being installed.
    import pdfplumber

    # The context manager closes the PDF even if a table fails to parse.
    with pdfplumber.open(PDF_PATH, password=PDF_PASSWORD) as pdf:
        for spec in TABLES:
            rows = collect_rows(pdf, spec.pages)
            frame = rows_to_frame(rows, spec.layout, spec.parse_row)
            frame.to_excel(spec.output)


if __name__ == "__main__":
    main()