0.预处理
import pandas as pd
import numpy as np
excel = pd.read_excel('附件1.xlsx', 'Data', index_col=None, na_values=['NA'])
excel.columns
Index(['eventid', 'iyear', 'imonth', 'iday', 'approxdate', 'extended',
'resolution', 'country', 'country_txt', 'region',
...
'addnotes', 'scite1', 'scite2', 'scite3', 'dbsource', 'INT_LOG',
'INT_IDEO', 'INT_MISC', 'INT_ANY', 'related'],
dtype='object', length=135)
pd.DataFrame(excel[['eventid','nkill','nwound','property','propextent','propvalue','targtype1','targtype2','targtype3','country','region','resolution','iyear','imonth','iday']])
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
eventid
nkill
nwound
property
propextent
propvalue
targtype1
targtype2
targtype3
country
region
resolution
iyear
imonth
iday
0
199801010001
104.0
6.0
-9
NaN
NaN
4
NaN
NaN
34
11
NaT
1998
1
1
1
199801010002
0.0
3.0
0
NaN
NaN
19
NaN
NaN
167
9
NaT
1998
1
1
2
199801010003
1.0
0.0
0
NaN
NaN
14
NaN
NaN
603
8
NaT
1998
1
1
3
199801020001
0.0
0.0
1
3.0
NaN
7
NaN
NaN
95
10
NaT
1998
1
2
4
199801020002
0.0
1.0
0
NaN
NaN
14
NaN
NaN
155
10
NaT
1998
1
2
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
114178
201712310022
1.0
2.0
-9
NaN
NaN
4
NaN
NaN
182
11
NaT
2017
12
31
114179
201712310029
2.0
7.0
1
4.0
-99.0
4
NaN
NaN
200
10
NaT
2017
12
31
114180
201712310030
0.0
0.0
1
4.0
-99.0
14
NaN
NaN
160
5
NaT
2017
12
31
114181
201712310031
0.0
0.0
-9
NaN
NaN
2
NaN
NaN
92
6
NaT
2017
12
31
114182
201712310032
0.0
0.0
0
NaN
NaN
20
NaN
NaN
160
5
NaT
2017
12
31
114183 rows × 15 columns
import time
from datetime import datetime,date
# 方法一,利用python内置转换
tmp_time = date(excel['iyear'][0], excel['imonth'][0], excel['iday'][0]) # 生成datetime格式的时间
time.mktime(tmp_time.timetuple()) # 转化为时间戳
883584000.0
result = []
# 方法二,转换位pd.Timestamp
for i in range(excel.shape[0]):
# print(excel['iyear'][i], excel['imonth'][i], excel['iday'][i])
if excel['iday'][i]==0:
excel['iday'][i] = 1
result.append(pd.Timestamp(datetime(excel['iyear'][0], excel['imonth'][0], excel['iday'][0])))
result = pd.Series(result) # 存储所有的起始时间
result.head()
<ipython-input-6-a8da161efb8a>:6: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
excel['iday'][i] = 1
0 1998-01-01
1 1998-01-01
2 1998-01-01
3 1998-01-01
4 1998-01-01
dtype: datetime64[ns]
end = []
for i in range(excel.shape[0]):
if pd.notnull(excel['resolution'][i]): # 相反的是isnull(time)
excel['resolution'][i]=excel['resolution'][i]-result[i]
<ipython-input-7-4bb3bc5e54ae>:4: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
excel['resolution'][i]=excel['resolution'][i]-result[i]
/home/fxz/anaconda3/envs/notebook/lib/python3.8/site-packages/pandas/core/indexing.py:670: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
iloc._setitem_with_indexer(indexer, value)
# help(excel['resolution'][828])
for i in range(excel.shape[0]):
if pd.notnull(excel['resolution'][i]):
if excel['resolution'][i].value < 0: # 设置timedelta为负的值为NaT
excel['resolution'][i] = pd.NaT
<ipython-input-8-bcbd64a0a015>:5: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
excel['resolution'][i] = pd.NaT
excel.loc[69,'resolution'] # 使用这种方法可以避开SettingWithCopyWarning
Timedelta('26 days 00:00:00')
result = pd.DataFrame(excel[['nkill','nwound','propextent','propvalue','resolution']])
# resolution 表示持续时间,如果当天解决,设置为86400/2
for i in range(result.shape[0]):
if pd.notnull(result['resolution'][i]):
result.iloc[i,4] = result.iloc[i,4].value # 将持续时间,换成时间戳
else:
result.iloc[i,4] = 43200
from sklearn.impute import SimpleImputer # 新建填充器来处理NaN
# 1. 创建Imputer器
imp = SimpleImputer(missing_values=np.nan, strategy='mean') # 使用平均值填充的方法
# 2. 使用fit_transform()函数即可完成缺失值填充了
result1=imp.fit_transform(result)
array([[ 1.04000000e+02, 6.00000000e+00, 3.26518063e+00,
3.75636192e+04, 4.32000000e+04],
[ 0.00000000e+00, 3.00000000e+00, 3.26518063e+00,
3.75636192e+04, 4.32000000e+04],
[ 1.00000000e+00, 0.00000000e+00, 3.26518063e+00,
3.75636192e+04, 4.32000000e+04],
...,
[ 0.00000000e+00, 0.00000000e+00, 4.00000000e+00,
-9.90000000e+01, 4.32000000e+04],
[ 0.00000000e+00, 0.00000000e+00, 3.26518063e+00,
3.75636192e+04, 4.32000000e+04],
[ 0.00000000e+00, 0.00000000e+00, 3.26518063e+00,
3.75636192e+04, 4.32000000e+04]])
result1 = pd.DataFrame(result1) # 转换回DataFrame
result1.to_excel('result1.xlsx', sheet_name='Sheet1',index=False,header=False) # 去掉行和列的索引
result1 = pd.read_excel('result1.xlsx', sheet_name='Sheet1', header = None )
result_norm = (result1 - result1.min()) / (result1.max() - result1.min()) # 归一化
result_norm.to_excel('result2.xlsx', sheet_name='Sheet1',index=False,header=False) # 写入结果
最后更新于
这有帮助吗?