LISHUZUOXUN_yangjiang/score_doc/readPDF.py

192 lines
12 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import pdfplumber
import pandas as pd
import numpy as np
# Open the password-protected source PDF with pdfplumber.
# NOTE(review): no close() for this handle appears in the visible code, and
# the password is hard-coded here — consider a `with` block and external
# configuration.
pdf = pdfplumber.open('军事体育五项标准.pdf', password='456456')
# Page indices of the table(s) for each metric. They are used below as
# `pdf.pages[i]`, i.e. positions in the page list (0-based). Index 4 is not
# referenced by any of these lists.
manBodyShape_Page_list = [0, 1]
womanBodyShape_Page_list = [2, 3]
manBodyChinUpsPushUps_list = [5]
manSitUps_list = [6]
man60mSerpentineRun_list = [7]
man3000mRun_list = [8]
womanBodyArmDrapePushUps_list = [9]
womanSitUps_list = [10]
woman60mSerpentineRun_list = [11]
woman3000mRun_list = [12]
# Parse manBodyShape.
# Gather every table row from the pages in manBodyShape_Page_list, dropping
# falsy cells (None / empty string) from each row.
manBodyShape_comparisonTable_list = [
    [cell for cell in raw_row if cell]
    for page_no in manBodyShape_Page_list
    for raw_row in pdf.pages[page_no].extract_table()
]

# Column labels: the first row from its third cell onward, with the first
# label overwritten.
manBodyShape_columnName_list = manBodyShape_comparisonTable_list[0][2:]
manBodyShape_columnName_list[0] = '0~24岁'

# Keep only rows in which no cell equals one of the marker strings.
manBodyShape_comparisonTable_ar = np.array([
    row for row in manBodyShape_comparisonTable_list
    if not any(marker in row for marker in ('公式', '说明', '24岁以下'))
])

# First column (spaces removed) becomes the float row index.
manBodyShape_rowName_ar = np.array(
    [label.replace(' ', '') for label in manBodyShape_comparisonTable_ar[:, 0]]
).astype(float)

# Every remaining cell is a "~"-separated range; turn it into a list of floats.
manBodyShape_data_ar = np.array([
    [list(map(float, cell.replace(' ', '').split("~"))) for cell in row]
    for row in manBodyShape_comparisonTable_ar[:, 1:]
])

# Fill the frame one column at a time, then export it.
manBodyShape_df = pd.DataFrame(index=manBodyShape_rowName_ar,
                               columns=manBodyShape_columnName_list)
for index, col in enumerate(manBodyShape_columnName_list):
    manBodyShape_df[col] = list(manBodyShape_data_ar[:, index])
manBodyShape_df.to_excel('manBodyShape.xlsx')
# Parse womanBodyShape.
# Gather every table row from the pages in womanBodyShape_Page_list, dropping
# falsy cells (None / empty string) from each row.
womanBodyShape_comparisonTable_list = [
    [cell for cell in raw_row if cell]
    for page_no in womanBodyShape_Page_list
    for raw_row in pdf.pages[page_no].extract_table()
]

# Column labels: the first row from its third cell onward, with the first
# label overwritten.
womanBodyShape_columnName_list = womanBodyShape_comparisonTable_list[0][2:]
womanBodyShape_columnName_list[0] = '0~24岁'

# Keep only rows in which no cell equals one of the marker strings.
womanBodyShape_comparisonTable_ar = np.array([
    row for row in womanBodyShape_comparisonTable_list
    if not any(marker in row for marker in ('公式', '说明', '24岁以下'))
])

# First column (spaces removed) becomes the float row index.
womanBodyShape_rowName_ar = np.array(
    [label.replace(' ', '') for label in womanBodyShape_comparisonTable_ar[:, 0]]
).astype(float)

# Every remaining cell is a "~"-separated range; turn it into a list of floats.
womanBodyShape_data_ar = np.array([
    [list(map(float, cell.replace(' ', '').split("~"))) for cell in row]
    for row in womanBodyShape_comparisonTable_ar[:, 1:]
])

# Fill the frame one column at a time, then export it.
womanBodyShape_df = pd.DataFrame(index=womanBodyShape_rowName_ar,
                                 columns=womanBodyShape_columnName_list)
for index, col in enumerate(womanBodyShape_columnName_list):
    womanBodyShape_df[col] = list(womanBodyShape_data_ar[:, index])
womanBodyShape_df.to_excel('womanBodyShape.xlsx')
# Parse manBodyChinUps&PushUps.
# Gather every table row from the pages in manBodyChinUpsPushUps_list,
# dropping falsy cells (None / empty string) from each row.
manBodyChinUpsPushUps_comparisonTable_list = [
    [cell for cell in raw_row if cell]
    for page_no in manBodyChinUpsPushUps_list
    for raw_row in pdf.pages[page_no].extract_table()
]

# Column labels: the first row without its first cell, with the first label
# overwritten.
manBodyChinUpsPushUps_columnName_list = manBodyChinUpsPushUps_comparisonTable_list[0][1:]
manBodyChinUpsPushUps_columnName_list[0] = "0~24"

# Keep only rows in which no cell equals '25~27' or '备注'.
manBodyChinUpsPushUps_comparisonTable_ar = np.array([
    row for row in manBodyChinUpsPushUps_comparisonTable_list
    if not ('25~27' in row or '备注' in row)
])

# First column (spaces removed) becomes the integer row index.
manBodyChinUpsPushUps_rowName_ar = np.array(
    [label.replace(' ', '') for label in manBodyChinUpsPushUps_comparisonTable_ar[:, 0]]
).astype(int)

# Every remaining cell is an integer once embedded spaces are removed.
manBodyChinUpsPushUps_data_ar = np.array([
    [int(cell.replace(' ', '')) for cell in row]
    for row in manBodyChinUpsPushUps_comparisonTable_ar[:, 1:]
])

# Fill the frame one column at a time, then export it.
manBodyChinUpsPushUps_df = pd.DataFrame(index=manBodyChinUpsPushUps_rowName_ar,
                                        columns=manBodyChinUpsPushUps_columnName_list)
for index, col in enumerate(manBodyChinUpsPushUps_columnName_list):
    manBodyChinUpsPushUps_df[col] = list(manBodyChinUpsPushUps_data_ar[:, index])
manBodyChinUpsPushUps_df.to_excel('manBodyChinUpsPushUps.xlsx')
# Parse manSitUps.
# Gather every table row from the pages in manSitUps_list, dropping falsy
# cells (None / empty string) from each row.
manSitUps_comparisonTable_list = [
    [cell for cell in raw_row if cell]
    for page_no in manSitUps_list
    for raw_row in pdf.pages[page_no].extract_table()
]

# Column labels: the first row without its first cell, with the first label
# overwritten.
manSitUps_columnName_list = manSitUps_comparisonTable_list[0][1:]
manSitUps_columnName_list[0] = "0~24"

# Keep only rows in which no cell equals '25~27' or '备注'.
manSitUps_comparisonTable_ar = np.array([
    row for row in manSitUps_comparisonTable_list
    if not ('25~27' in row or '备注' in row)
])

# First column (spaces removed) becomes the integer row index.
manSitUps_rowName_ar = np.array(
    [label.replace(' ', '') for label in manSitUps_comparisonTable_ar[:, 0]]
).astype(int)

# Every remaining cell is an integer once embedded spaces are removed.
manSitUps_data_ar = np.array([
    [int(cell.replace(' ', '')) for cell in row]
    for row in manSitUps_comparisonTable_ar[:, 1:]
])

# Fill the frame one column at a time, then export it.
manSitUps_df = pd.DataFrame(index=manSitUps_rowName_ar,
                            columns=manSitUps_columnName_list)
for index, col in enumerate(manSitUps_columnName_list):
    manSitUps_df[col] = list(manSitUps_data_ar[:, index])
manSitUps_df.to_excel('manSitUps.xlsx')
# Parse man60mSerpentineRun.
def _parse_man60mSerpentineRun_cell(cell):
    """Convert one table cell to a float: first digit group + second * 0.1.

    The cell is expected to hold two groups of decimal digits separated by
    some non-digit mark (presumably seconds and tenths — confirm against the
    PDF).

    Raises:
        ValueError: if the cell does not contain exactly two digit groups.
    """
    # Split on every non-digit character instead of one literal separator:
    # str.split("") raises "ValueError: empty separator", and the mark glyph
    # in the extracted text is presumably a symbol that is easily lost when
    # the file is edited or copied (TODO confirm).
    groups = ''.join(ch if ch.isdecimal() else ' ' for ch in cell).split()
    if len(groups) != 2:
        raise ValueError("cannot parse man60mSerpentineRun cell %r" % (cell,))
    return int(groups[0]) + int(groups[1]) * 0.1


# Gather every table row from the pages in man60mSerpentineRun_list, dropping
# falsy cells (None / empty string) from each row.
man60mSerpentineRun_comparisonTable_list = [
    [cell for cell in raw_row if cell]
    for page_no in man60mSerpentineRun_list
    for raw_row in pdf.pages[page_no].extract_table()
]

# Column labels: the first row without its first cell, with the first label
# overwritten.
man60mSerpentineRun_columnName_list = man60mSerpentineRun_comparisonTable_list[0][1:]
man60mSerpentineRun_columnName_list[0] = "0~24"

# Keep only rows in which no cell equals '25~27' or '备注'.
man60mSerpentineRun_comparisonTable_ar = np.array([
    row for row in man60mSerpentineRun_comparisonTable_list
    if '25~27' not in row and '备注' not in row
])

# First column (spaces removed) becomes the integer row index.
man60mSerpentineRun_rowName_ar = np.array(
    [label.replace(' ', '') for label in man60mSerpentineRun_comparisonTable_ar[:, 0]]
).astype(int)

# Remaining cells are converted with the helper above.
man60mSerpentineRun_data_ar = np.array([
    [_parse_man60mSerpentineRun_cell(cell) for cell in row]
    for row in man60mSerpentineRun_comparisonTable_ar[:, 1:]
])

# Fill the frame one column at a time, then export it.
man60mSerpentineRun_df = pd.DataFrame(columns=man60mSerpentineRun_columnName_list,
                                      index=man60mSerpentineRun_rowName_ar)
for index, col in enumerate(man60mSerpentineRun_columnName_list):
    man60mSerpentineRun_df[col] = list(man60mSerpentineRun_data_ar[:, index])
man60mSerpentineRun_df.to_excel('man60mSerpentineRun.xlsx')
# Parse man3000mRun.
def _parse_man3000mRun_cell(cell):
    """Convert one table cell to an int: first digit group * 60 + second.

    The cell is expected to hold two groups of decimal digits separated and
    followed by non-digit marks (presumably minutes and seconds — confirm
    against the PDF).

    Raises:
        ValueError: if the cell does not contain exactly two digit groups.
    """
    # Split on every non-digit character instead of one literal separator:
    # str.split("") raises "ValueError: empty separator", and the mark glyphs
    # in the extracted text are presumably symbols that are easily lost when
    # the file is edited or copied (TODO confirm). This also makes the
    # trailing-mark strip ([:-1]) unnecessary.
    groups = ''.join(ch if ch.isdecimal() else ' ' for ch in cell).split()
    if len(groups) != 2:
        raise ValueError("cannot parse man3000mRun cell %r" % (cell,))
    return int(groups[0]) * 60 + int(groups[1])


# Gather every table row from the pages in man3000mRun_list, dropping falsy
# cells (None / empty string) from each row.
man3000mRun_comparisonTable_list = [
    [cell for cell in raw_row if cell]
    for page_no in man3000mRun_list
    for raw_row in pdf.pages[page_no].extract_table()
]

# Column labels: the first row without its first cell, with the first label
# overwritten.
man3000mRun_columnName_list = man3000mRun_comparisonTable_list[0][1:]
man3000mRun_columnName_list[0] = "0~24"

# Keep only rows in which no cell equals '25~27' or '备注'.
man3000mRun_comparisonTable_ar = np.array([
    row for row in man3000mRun_comparisonTable_list
    if '25~27' not in row and '备注' not in row
])

# First column (spaces removed) becomes the integer row index.
man3000mRun_rowName_ar = np.array(
    [label.replace(' ', '') for label in man3000mRun_comparisonTable_ar[:, 0]]
).astype(int)

# Remaining cells are converted with the helper above.
man3000mRun_data_ar = np.array([
    [_parse_man3000mRun_cell(cell) for cell in row]
    for row in man3000mRun_comparisonTable_ar[:, 1:]
])

# Fill the frame one column at a time, then export it.
man3000mRun_df = pd.DataFrame(columns=man3000mRun_columnName_list,
                              index=man3000mRun_rowName_ar)
for index, col in enumerate(man3000mRun_columnName_list):
    man3000mRun_df[col] = list(man3000mRun_data_ar[:, index])
man3000mRun_df.to_excel('man3000mRun.xlsx')
# Parse womanBodyArmDrape&PushUps.
def _parse_womanArmDrape_cell(cell):
    """Convert one time cell to an int.

    Two digit groups are combined as first * 60 + second; a single digit
    group is returned as is (presumably "minutes + seconds" versus "seconds
    only" — confirm against the PDF).

    Raises:
        ValueError: if the cell holds neither one nor two digit groups.
    """
    # Split on every non-digit character instead of one literal separator:
    # str.split("") raises "ValueError: empty separator", and the mark glyphs
    # in the extracted text are presumably symbols that are easily lost when
    # the file is edited or copied (TODO confirm).
    groups = ''.join(ch if ch.isdecimal() else ' ' for ch in cell).split()
    if len(groups) == 2:
        return int(groups[0]) * 60 + int(groups[1])
    if len(groups) == 1:
        return int(groups[0])
    raise ValueError("cannot parse womanArmDrape cell %r" % (cell,))


# Gather every table row from the pages in womanBodyArmDrapePushUps_list,
# dropping falsy cells (None / empty string) from each row.
womanBodyArmDrapePushUps_comparisonTable_list = [
    [cell for cell in raw_row if cell]
    for page_no in womanBodyArmDrapePushUps_list
    for raw_row in pdf.pages[page_no].extract_table()
]

# Column labels: the first row without its first cell, with the first label
# overwritten.
womanBodyArmDrapePushUps_columnName_list = womanBodyArmDrapePushUps_comparisonTable_list[0][1:]
womanBodyArmDrapePushUps_columnName_list[0] = "0~24"

# Keep only rows in which no cell equals '25~27' or '备注'.
womanBodyArmDrapePushUps_comparisonTable_ar = np.array([
    row for row in womanBodyArmDrapePushUps_comparisonTable_list
    if '25~27' not in row and '备注' not in row
])

# First column (spaces removed) becomes the integer row index.
womanBodyArmDrapePushUps_rowName_ar = np.array(
    [label.replace(' ', '') for label in womanBodyArmDrapePushUps_comparisonTable_ar[:, 0]]
).astype(int)

# Columns 1..6 hold time cells; columns 7 onward hold plain integer counts.
womanArmDrape_data_ar = np.array([
    [_parse_womanArmDrape_cell(cell) for cell in row]
    for row in womanBodyArmDrapePushUps_comparisonTable_ar[:, 1:7]
])
womanPushUps_data_ar = np.array([
    [int(cell.replace(' ', '')) for cell in row]
    for row in womanBodyArmDrapePushUps_comparisonTable_ar[:, 7:]
])

# Join both halves side by side, fill the frame column by column, then export.
womanBodyArmDrapePushUps_df = pd.DataFrame(columns=womanBodyArmDrapePushUps_columnName_list,
                                           index=womanBodyArmDrapePushUps_rowName_ar)
womanBodyArmDrapePushUps_data_ar = np.hstack((womanArmDrape_data_ar, womanPushUps_data_ar))
for index, col in enumerate(womanBodyArmDrapePushUps_columnName_list):
    womanBodyArmDrapePushUps_df[col] = list(womanBodyArmDrapePushUps_data_ar[:, index])
womanBodyArmDrapePushUps_df.to_excel('womanBodyArmDrapePushUps.xlsx')
# Parse womanSitUps.
# Gather every table row from the pages in womanSitUps_list, dropping falsy
# cells (None / empty string) from each row.
womanSitUps_comparisonTable_list = [
    [cell for cell in raw_row if cell]
    for page_no in womanSitUps_list
    for raw_row in pdf.pages[page_no].extract_table()
]

# Column labels: the first row without its first cell, with the first label
# overwritten.
womanSitUps_columnName_list = womanSitUps_comparisonTable_list[0][1:]
womanSitUps_columnName_list[0] = "0~24"

# Keep only rows in which no cell equals '25~27' or '备注'.
womanSitUps_comparisonTable_ar = np.array([
    row for row in womanSitUps_comparisonTable_list
    if not ('25~27' in row or '备注' in row)
])

# First column (spaces removed) becomes the integer row index.
womanSitUps_rowName_ar = np.array(
    [label.replace(' ', '') for label in womanSitUps_comparisonTable_ar[:, 0]]
).astype(int)

# Every remaining cell is an integer once embedded spaces are removed.
womanSitUps_data_ar = np.array([
    [int(cell.replace(' ', '')) for cell in row]
    for row in womanSitUps_comparisonTable_ar[:, 1:]
])

# Fill the frame one column at a time, then export it.
womanSitUps_df = pd.DataFrame(index=womanSitUps_rowName_ar,
                              columns=womanSitUps_columnName_list)
for index, col in enumerate(womanSitUps_columnName_list):
    womanSitUps_df[col] = list(womanSitUps_data_ar[:, index])
womanSitUps_df.to_excel('womanSitUps.xlsx')
# Parse woman60mSerpentineRun.
def _parse_woman60mSerpentineRun_cell(cell):
    """Convert one table cell to a float: first digit group + second * 0.1.

    The cell is expected to hold two groups of decimal digits separated by
    some non-digit mark (presumably seconds and tenths — confirm against the
    PDF).

    Raises:
        ValueError: if the cell does not contain exactly two digit groups.
    """
    # Split on every non-digit character instead of one literal separator:
    # str.split("") raises "ValueError: empty separator", and the mark glyph
    # in the extracted text is presumably a symbol that is easily lost when
    # the file is edited or copied (TODO confirm).
    groups = ''.join(ch if ch.isdecimal() else ' ' for ch in cell).split()
    if len(groups) != 2:
        raise ValueError("cannot parse woman60mSerpentineRun cell %r" % (cell,))
    return int(groups[0]) + int(groups[1]) * 0.1


# Gather every table row from the pages in woman60mSerpentineRun_list,
# dropping falsy cells (None / empty string) from each row.
woman60mSerpentineRun_comparisonTable_list = [
    [cell for cell in raw_row if cell]
    for page_no in woman60mSerpentineRun_list
    for raw_row in pdf.pages[page_no].extract_table()
]

# Column labels: the first row without its first cell, with the first label
# overwritten.
woman60mSerpentineRun_columnName_list = woman60mSerpentineRun_comparisonTable_list[0][1:]
woman60mSerpentineRun_columnName_list[0] = "0~24"

# Keep only rows in which no cell equals '25~27' or '备注'.
woman60mSerpentineRun_comparisonTable_ar = np.array([
    row for row in woman60mSerpentineRun_comparisonTable_list
    if '25~27' not in row and '备注' not in row
])

# First column (spaces removed) becomes the integer row index.
woman60mSerpentineRun_rowName_ar = np.array(
    [label.replace(' ', '') for label in woman60mSerpentineRun_comparisonTable_ar[:, 0]]
).astype(int)

# Remaining cells are converted with the helper above.
woman60mSerpentineRun_data_ar = np.array([
    [_parse_woman60mSerpentineRun_cell(cell) for cell in row]
    for row in woman60mSerpentineRun_comparisonTable_ar[:, 1:]
])

# Fill the frame one column at a time, then export it.
woman60mSerpentineRun_df = pd.DataFrame(columns=woman60mSerpentineRun_columnName_list,
                                        index=woman60mSerpentineRun_rowName_ar)
for index, col in enumerate(woman60mSerpentineRun_columnName_list):
    woman60mSerpentineRun_df[col] = list(woman60mSerpentineRun_data_ar[:, index])
woman60mSerpentineRun_df.to_excel('woman60mSerpentineRun.xlsx')
# Parse woman3000mRun.
def _parse_woman3000mRun_cell(cell):
    """Convert one table cell to an int: first digit group * 60 + second.

    The cell is expected to hold two groups of decimal digits separated and
    followed by non-digit marks (presumably minutes and seconds — confirm
    against the PDF).

    Raises:
        ValueError: if the cell does not contain exactly two digit groups.
    """
    # Split on every non-digit character instead of one literal separator:
    # str.split("") raises "ValueError: empty separator", and the mark glyphs
    # in the extracted text are presumably symbols that are easily lost when
    # the file is edited or copied (TODO confirm). This also makes the
    # trailing-mark strip ([:-1]) unnecessary.
    groups = ''.join(ch if ch.isdecimal() else ' ' for ch in cell).split()
    if len(groups) != 2:
        raise ValueError("cannot parse woman3000mRun cell %r" % (cell,))
    return int(groups[0]) * 60 + int(groups[1])


# Gather every table row from the pages in woman3000mRun_list, dropping falsy
# cells (None / empty string) from each row.
woman3000mRun_comparisonTable_list = [
    [cell for cell in raw_row if cell]
    for page_no in woman3000mRun_list
    for raw_row in pdf.pages[page_no].extract_table()
]

# Column labels: the first row without its first cell, with the first label
# overwritten.
woman3000mRun_columnName_list = woman3000mRun_comparisonTable_list[0][1:]
woman3000mRun_columnName_list[0] = "0~24"

# Keep only rows in which no cell equals '25~27' or '备注'.
woman3000mRun_comparisonTable_ar = np.array([
    row for row in woman3000mRun_comparisonTable_list
    if '25~27' not in row and '备注' not in row
])

# First column (spaces removed) becomes the integer row index.
woman3000mRun_rowName_ar = np.array(
    [label.replace(' ', '') for label in woman3000mRun_comparisonTable_ar[:, 0]]
).astype(int)

# Remaining cells are converted with the helper above.
woman3000mRun_data_ar = np.array([
    [_parse_woman3000mRun_cell(cell) for cell in row]
    for row in woman3000mRun_comparisonTable_ar[:, 1:]
])

# Fill the frame one column at a time, then export it.
woman3000mRun_df = pd.DataFrame(columns=woman3000mRun_columnName_list,
                                index=woman3000mRun_rowName_ar)
for index, col in enumerate(woman3000mRun_columnName_list):
    woman3000mRun_df[col] = list(woman3000mRun_data_ar[:, index])
woman3000mRun_df.to_excel('woman3000mRun.xlsx')