# LISHUZUOXUN_yangjiang/score_doc/readPDF.py
#
# Repository-viewer metadata captured alongside this file:
#   192 lines, 12 KiB, Python
#   timestamp shown by the viewer: 2024-09-23 14:54:15 +08:00
import pdfplumber
import pandas as pd
import numpy as np
# Convert the scoring tables of the standards PDF into one Excel workbook per
# table. Every table goes through the same pipeline: collect the rows of its
# pages, take the column names from the first row, drop header/annotation rows,
# convert the remaining cells to numbers, and write the result with to_excel.

# Page indices (positions in pdf.pages) holding the table of each scoring item.
manBodyShape_Page_list = [0, 1]
womanBodyShape_Page_list = [2, 3]
manBodyChinUpsPushUps_list = [5]
manSitUps_list = [6]
man60mSerpentineRun_list = [7]
man3000mRun_list = [8]
womanBodyArmDrapePushUps_list = [9]
womanSitUps_list = [10]
woman60mSerpentineRun_list = [11]
woman3000mRun_list = [12]

# Layout of the two body-shape tables. Their header row carries two leading
# label cells (column names start at element 2), the row labels are parsed as
# floats, and rows containing a cell equal to '公式' (formula), '说明' (notes)
# or '24岁以下' (under 24, i.e. the header row itself) are not data rows.
_BODY_SHAPE_LAYOUT = dict(
    header_offset=2,
    first_column_label='0~24岁',
    skip_markers=('公式', '说明', '24岁以下'),
    index_dtype=float,
)


def _digit_groups(text):
    """Return the runs of decimal digits found in *text*, in order.

    Every non-digit character acts as a separator, so the result does not
    depend on which mark glyphs the PDF uses between or after the numbers.
    """
    return ''.join(ch if ch.isdecimal() else ' ' for ch in text).split()


def _range_bounds(cell):
    """Convert a 'low~high' cell into a list of floats (spaces ignored)."""
    return [float(bound) for bound in cell.replace(' ', '').split("~")]


def _count(cell):
    """Convert a cell holding a whole number into an int (spaces ignored)."""
    return int(cell.replace(' ', ''))


def _whole_plus_tenths(cell):
    """Convert a '<whole><mark><tenths>' cell into whole + tenths * 0.1.

    The original code called str.split("") here, which always raises
    ValueError (empty separator); the separator glyph was presumably lost
    from the literal. Reading the digit groups is separator-agnostic.
    """
    groups = _digit_groups(cell)
    return int(groups[0]) + int(groups[1]) * 0.1


def _sixty_times_first_plus_second(cell):
    """Convert a '<a><mark><b><mark>' cell into a * 60 + b.

    Presumably minutes and seconds turned into total seconds - confirm
    against the PDF. Replaces a str.split("") call that could never succeed.
    """
    groups = _digit_groups(cell)
    return int(groups[0]) * 60 + int(groups[1])


def _hang_duration(cell):
    """Convert a flexed-arm-hang cell into a single integer.

    Two digit groups are combined as first * 60 + second; a cell with a
    single group yields that number unchanged.
    NOTE(review): a single group is assumed to be bare seconds. A cell that
    holds minutes only would be misread - verify against the PDF.
    """
    groups = _digit_groups(cell)
    if len(groups) > 1:
        return int(groups[0]) * 60 + int(groups[1])
    return int(groups[0])


def _cellwise(convert):
    """Return a body parser that applies *convert* to every cell of a table."""
    def parse(cells):
        return np.array([[convert(cell) for cell in row] for row in cells])
    return parse


_parse_ranges = _cellwise(_range_bounds)
_parse_counts = _cellwise(_count)
_parse_tenths = _cellwise(_whole_plus_tenths)
_parse_minutes_seconds = _cellwise(_sixty_times_first_plus_second)


def _parse_hang_and_push_ups(cells):
    """Parse the women's combined table body.

    The first six value columns are converted as hang durations and the
    remaining columns as plain counts, then joined side by side again.
    """
    hang = _cellwise(_hang_duration)(cells[:, :6])
    push_ups = _parse_counts(cells[:, 6:])
    return np.hstack((hang, push_ups))


def _build_score_frame(pdf, page_numbers, parse_body, *, header_offset=1,
                       first_column_label="0~24",
                       skip_markers=('25~27', '备注'), index_dtype=int):
    """Build the DataFrame for one scoring table.

    Args:
        pdf: the opened pdfplumber document.
        page_numbers: indices into ``pdf.pages`` whose tables are concatenated.
        parse_body: callable turning the 2-D array of value cells (everything
            right of the row-label column) into a numeric array.
        header_offset: position in the first row where column names start.
        first_column_label: name that replaces the first column name.
        skip_markers: a row is dropped when any of its cells equals one of
            these strings. The defaults '25~27' / '备注' (remarks) drop the
            header and remark rows of the exercise tables.
        index_dtype: dtype the row labels are cast to.

    Returns:
        A DataFrame indexed by the row labels with one column per column name.
    """
    # filter(None, ...) removes the None / empty cells that extract_table
    # returns, e.g. for merged cells.
    rows = [list(filter(None, row))
            for page_number in page_numbers
            for row in pdf.pages[page_number].extract_table()]

    # Slicing copies, so relabelling the first column leaves rows[0] intact
    # and the marker test below still recognises the header row.
    column_names = rows[0][header_offset:]
    column_names[0] = first_column_label

    table = np.array([row for row in rows
                      if not any(marker in row for marker in skip_markers)])
    row_labels = np.array(
        [label.replace(' ', '') for label in table[:, 0]]).astype(index_dtype)
    data = parse_body(table[:, 1:])

    frame = pd.DataFrame(columns=column_names, index=row_labels)
    for position, name in enumerate(column_names):
        frame[name] = list(data[:, position])
    return frame


# NOTE(review): the PDF password is hard-coded in the source.
# The context manager closes the PDF once all tables are exported; the
# original script never closed the handle.
with pdfplumber.open('军事体育五项标准.pdf', password='456456') as pdf:
    # Men's body shape
    manBodyShape_df = _build_score_frame(
        pdf, manBodyShape_Page_list, _parse_ranges, **_BODY_SHAPE_LAYOUT)
    manBodyShape_df.to_excel('manBodyShape.xlsx')

    # Women's body shape
    womanBodyShape_df = _build_score_frame(
        pdf, womanBodyShape_Page_list, _parse_ranges, **_BODY_SHAPE_LAYOUT)
    womanBodyShape_df.to_excel('womanBodyShape.xlsx')

    # Men's chin-ups & push-ups
    manBodyChinUpsPushUps_df = _build_score_frame(
        pdf, manBodyChinUpsPushUps_list, _parse_counts)
    manBodyChinUpsPushUps_df.to_excel('manBodyChinUpsPushUps.xlsx')

    # Men's sit-ups
    manSitUps_df = _build_score_frame(pdf, manSitUps_list, _parse_counts)
    manSitUps_df.to_excel('manSitUps.xlsx')

    # Men's 60 m serpentine run
    man60mSerpentineRun_df = _build_score_frame(
        pdf, man60mSerpentineRun_list, _parse_tenths)
    man60mSerpentineRun_df.to_excel('man60mSerpentineRun.xlsx')

    # Men's 3000 m run
    man3000mRun_df = _build_score_frame(
        pdf, man3000mRun_list, _parse_minutes_seconds)
    man3000mRun_df.to_excel('man3000mRun.xlsx')

    # Women's flexed-arm hang & push-ups
    womanBodyArmDrapePushUps_df = _build_score_frame(
        pdf, womanBodyArmDrapePushUps_list, _parse_hang_and_push_ups)
    womanBodyArmDrapePushUps_df.to_excel('womanBodyArmDrapePushUps.xlsx')

    # Women's sit-ups
    womanSitUps_df = _build_score_frame(pdf, womanSitUps_list, _parse_counts)
    womanSitUps_df.to_excel('womanSitUps.xlsx')

    # Women's 60 m serpentine run
    woman60mSerpentineRun_df = _build_score_frame(
        pdf, woman60mSerpentineRun_list, _parse_tenths)
    woman60mSerpentineRun_df.to_excel('woman60mSerpentineRun.xlsx')

    # Women's 3000 m run
    woman3000mRun_df = _build_score_frame(
        pdf, woman3000mRun_list, _parse_minutes_seconds)
    woman3000mRun_df.to_excel('woman3000mRun.xlsx')