0.预处理

import pandas as pd
import numpy as np

excel = pd.read_excel('附件1.xlsx', 'Data', index_col=None, na_values=['NA'])

excel.columns

Index(['eventid', 'iyear', 'imonth', 'iday', 'approxdate', 'extended',
       'resolution', 'country', 'country_txt', 'region',
       ...
       'addnotes', 'scite1', 'scite2', 'scite3', 'dbsource', 'INT_LOG',
       'INT_IDEO', 'INT_MISC', 'INT_ANY', 'related'],
      dtype='object', length=135)

pd.DataFrame(excel[['eventid','nkill','nwound','property','propextent','propvalue','targtype1','targtype2','targtype3','country','region','resolution','iyear','imonth','iday']])

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

eventid

nkill

nwound

property

propextent

propvalue

targtype1

targtype2

targtype3

country

region

resolution

iyear

imonth

iday

199801010001

104.0

6.0

-9

NaN

NaT

1998

199801010002

0.0

3.0

NaN

167

NaT

1998

199801010003

1.0

0.0

NaN

603

NaT

1998

199801020001

0.0

3.0

NaN

NaT

1998

199801020002

0.0

1.0

NaN

155

NaT

1998

...

114178

201712310022

1.0

2.0

-9

NaN

182

NaT

2017

114179

201712310029

2.0

7.0

4.0

-99.0

NaN

200

NaT

2017

114180

201712310030

0.0

4.0

-99.0

NaN

160

NaT

2017

114181

201712310031

0.0

-9

NaN

NaT

2017

114182

201712310032

0.0

NaN

160

NaT

2017

114183 rows × 15 columns

import time
from datetime import datetime,date
# 方法一,利用python内置转换
tmp_time = date(excel['iyear'][0], excel['imonth'][0], excel['iday'][0]) # 生成datetime格式的时间
time.mktime(tmp_time.timetuple()) # 转化为时间戳

883584000.0

result = []
# 方法二,转换位pd.Timestamp
for i in range(excel.shape[0]):
#     print(excel['iyear'][i], excel['imonth'][i], excel['iday'][i])
    if excel['iday'][i]==0:
        excel['iday'][i] = 1
    result.append(pd.Timestamp(datetime(excel['iyear'][0], excel['imonth'][0], excel['iday'][0])))
result = pd.Series(result) # 存储所有的起始时间
result.head()

<ipython-input-6-a8da161efb8a>:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  excel['iday'][i] = 1





0   1998-01-01
1   1998-01-01
2   1998-01-01
3   1998-01-01
4   1998-01-01
dtype: datetime64[ns]

end = []
for i in range(excel.shape[0]):
    if pd.notnull(excel['resolution'][i]): # 相反的是isnull(time)
        excel['resolution'][i]=excel['resolution'][i]-result[i]

<ipython-input-7-4bb3bc5e54ae>:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  excel['resolution'][i]=excel['resolution'][i]-result[i]
/home/fxz/anaconda3/envs/notebook/lib/python3.8/site-packages/pandas/core/indexing.py:670: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)

# help(excel['resolution'][828])
for i in range(excel.shape[0]):
    if pd.notnull(excel['resolution'][i]): 
        if excel['resolution'][i].value < 0: # 设置timedelta为负的值为NaT
            excel['resolution'][i] = pd.NaT

<ipython-input-8-bcbd64a0a015>:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  excel['resolution'][i] = pd.NaT

excel.loc[69,'resolution'] # 使用这种方法可以避开SettingWithCopyWarning

Timedelta('26 days 00:00:00')

result = pd.DataFrame(excel[['nkill','nwound','propextent','propvalue','resolution']])

# resolution 表示持续时间,如果当天解决,设置为86400/2
for i in range(result.shape[0]):
    if pd.notnull(result['resolution'][i]): 
        result.iloc[i,4] = result.iloc[i,4].value # 将持续时间,换成时间戳
    else:
        result.iloc[i,4] = 43200

from sklearn.impute import SimpleImputer # 新建填充器来处理NaN
# 1. 创建Imputer器
imp = SimpleImputer(missing_values=np.nan, strategy='mean') # 使用平均值填充的方法

# 2. 使用fit_transform()函数即可完成缺失值填充了
result1=imp.fit_transform(result)

array([[ 1.04000000e+02,  6.00000000e+00,  3.26518063e+00,
         3.75636192e+04,  4.32000000e+04],
       [ 0.00000000e+00,  3.00000000e+00,  3.26518063e+00,
         3.75636192e+04,  4.32000000e+04],
       [ 1.00000000e+00,  0.00000000e+00,  3.26518063e+00,
         3.75636192e+04,  4.32000000e+04],
       ...,
       [ 0.00000000e+00,  0.00000000e+00,  4.00000000e+00,
        -9.90000000e+01,  4.32000000e+04],
       [ 0.00000000e+00,  0.00000000e+00,  3.26518063e+00,
         3.75636192e+04,  4.32000000e+04],
       [ 0.00000000e+00,  0.00000000e+00,  3.26518063e+00,
         3.75636192e+04,  4.32000000e+04]])

result1 = pd.DataFrame(result1) # 转换回DataFrame
result1.to_excel('result1.xlsx', sheet_name='Sheet1',index=False,header=False) # 去掉行和列的索引

result1 = pd.read_excel('result1.xlsx', sheet_name='Sheet1', header = None )
result_norm = (result1 - result1.min()) / (result1.max() - result1.min()) # 归一化
result_norm.to_excel('result2.xlsx', sheet_name='Sheet1',index=False,header=False) # 写入结果

最后更新于4年前

这有帮助吗？