0.预处理

import pandas as pd
import numpy as np
excel = pd.read_excel('附件1.xlsx', 'Data', index_col=None, na_values=['NA'])
excel.columns
Index(['eventid', 'iyear', 'imonth', 'iday', 'approxdate', 'extended',
       'resolution', 'country', 'country_txt', 'region',
       ...
       'addnotes', 'scite1', 'scite2', 'scite3', 'dbsource', 'INT_LOG',
       'INT_IDEO', 'INT_MISC', 'INT_ANY', 'related'],
      dtype='object', length=135)
pd.DataFrame(excel[['eventid','nkill','nwound','property','propextent','propvalue','targtype1','targtype2','targtype3','country','region','resolution','iyear','imonth','iday']])

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

eventid

nkill

nwound

property

propextent

propvalue

targtype1

targtype2

targtype3

country

region

resolution

iyear

imonth

iday

0

199801010001

104.0

6.0

-9

NaN

NaN

4

NaN

NaN

34

11

NaT

1998

1

1

1

199801010002

0.0

3.0

0

NaN

NaN

19

NaN

NaN

167

9

NaT

1998

1

1

2

199801010003

1.0

0.0

0

NaN

NaN

14

NaN

NaN

603

8

NaT

1998

1

1

3

199801020001

0.0

0.0

1

3.0

NaN

7

NaN

NaN

95

10

NaT

1998

1

2

4

199801020002

0.0

1.0

0

NaN

NaN

14

NaN

NaN

155

10

NaT

1998

1

2

...

...

...

...

...

...

...

...

...

...

...

...

...

...

...

...

114178

201712310022

1.0

2.0

-9

NaN

NaN

4

NaN

NaN

182

11

NaT

2017

12

31

114179

201712310029

2.0

7.0

1

4.0

-99.0

4

NaN

NaN

200

10

NaT

2017

12

31

114180

201712310030

0.0

0.0

1

4.0

-99.0

14

NaN

NaN

160

5

NaT

2017

12

31

114181

201712310031

0.0

0.0

-9

NaN

NaN

2

NaN

NaN

92

6

NaT

2017

12

31

114182

201712310032

0.0

0.0

0

NaN

NaN

20

NaN

NaN

160

5

NaT

2017

12

31

114183 rows × 15 columns

import time
from datetime import datetime,date
# 方法一,利用python内置转换
tmp_time = date(excel['iyear'][0], excel['imonth'][0], excel['iday'][0]) # 生成datetime格式的时间
time.mktime(tmp_time.timetuple()) # 转化为时间戳
883584000.0
result = []
# 方法二,转换位pd.Timestamp
for i in range(excel.shape[0]):
#     print(excel['iyear'][i], excel['imonth'][i], excel['iday'][i])
    if excel['iday'][i]==0:
        excel['iday'][i] = 1
    result.append(pd.Timestamp(datetime(excel['iyear'][0], excel['imonth'][0], excel['iday'][0])))
result = pd.Series(result) # 存储所有的起始时间
result.head()
<ipython-input-6-a8da161efb8a>:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  excel['iday'][i] = 1





0   1998-01-01
1   1998-01-01
2   1998-01-01
3   1998-01-01
4   1998-01-01
dtype: datetime64[ns]
end = []
for i in range(excel.shape[0]):
    if pd.notnull(excel['resolution'][i]): # 相反的是isnull(time)
        excel['resolution'][i]=excel['resolution'][i]-result[i]
<ipython-input-7-4bb3bc5e54ae>:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  excel['resolution'][i]=excel['resolution'][i]-result[i]
/home/fxz/anaconda3/envs/notebook/lib/python3.8/site-packages/pandas/core/indexing.py:670: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)
# help(excel['resolution'][828])
for i in range(excel.shape[0]):
    if pd.notnull(excel['resolution'][i]): 
        if excel['resolution'][i].value < 0: # 设置timedelta为负的值为NaT
            excel['resolution'][i] = pd.NaT
<ipython-input-8-bcbd64a0a015>:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  excel['resolution'][i] = pd.NaT
excel.loc[69,'resolution'] # 使用这种方法可以避开SettingWithCopyWarning
Timedelta('26 days 00:00:00')
result = pd.DataFrame(excel[['nkill','nwound','propextent','propvalue','resolution']])
# resolution 表示持续时间,如果当天解决,设置为86400/2
for i in range(result.shape[0]):
    if pd.notnull(result['resolution'][i]): 
        result.iloc[i,4] = result.iloc[i,4].value # 将持续时间,换成时间戳
    else:
        result.iloc[i,4] = 43200
from sklearn.impute import SimpleImputer # 新建填充器来处理NaN
# 1. 创建Imputer器
imp = SimpleImputer(missing_values=np.nan, strategy='mean') # 使用平均值填充的方法

# 2. 使用fit_transform()函数即可完成缺失值填充了
result1=imp.fit_transform(result)
array([[ 1.04000000e+02,  6.00000000e+00,  3.26518063e+00,
         3.75636192e+04,  4.32000000e+04],
       [ 0.00000000e+00,  3.00000000e+00,  3.26518063e+00,
         3.75636192e+04,  4.32000000e+04],
       [ 1.00000000e+00,  0.00000000e+00,  3.26518063e+00,
         3.75636192e+04,  4.32000000e+04],
       ...,
       [ 0.00000000e+00,  0.00000000e+00,  4.00000000e+00,
        -9.90000000e+01,  4.32000000e+04],
       [ 0.00000000e+00,  0.00000000e+00,  3.26518063e+00,
         3.75636192e+04,  4.32000000e+04],
       [ 0.00000000e+00,  0.00000000e+00,  3.26518063e+00,
         3.75636192e+04,  4.32000000e+04]])
result1 = pd.DataFrame(result1) # 转换回DataFrame
result1.to_excel('result1.xlsx', sheet_name='Sheet1',index=False,header=False) # 去掉行和列的索引
result1 = pd.read_excel('result1.xlsx', sheet_name='Sheet1', header = None )
result_norm = (result1 - result1.min()) / (result1.max() - result1.min()) # 归一化
result_norm.to_excel('result2.xlsx', sheet_name='Sheet1',index=False,header=False) # 写入结果

最后更新于

这有帮助吗?