# Andy's blog

# pandas简单使用
#%%

import pandas as pd
from pandas import Series, DataFrame
import numpy as np

# A Series is a one-dimensional, array-like object made of two parts:
#   values: the data (an ndarray)
#   index:  the labels associated with the data

# The data passed to a Series must be one-dimensional.
s1 = Series(data=[1, 2, 3, 4])
s2 = Series(data=np.random.randint(0, 100, size=(4,)))

dic = {
    'a': 1,
    'b': 2,
    'c': 3,
}
# A Series index may consist of strings; here the dict keys become the index.
s3 = Series(data=dic)
s3
"""
a    1
b    2
c    3
dtype: int64
"""
s4 = Series(data=[2, 3, 4], index=['数学', '英语', '理综'])
s4
"""
数学    2
英语    3
理综    4
dtype: int64
"""
# Series indexing
# Implicit index: integer positions
# Explicit index: user-defined labels, which make the data easier to read

# Indexing and slicing
# Positional access goes through .iloc: using a bare integer key such as
# s4[0] on a label-indexed Series is deprecated in recent pandas releases.
s4.iloc[0]  # 2
s4['数学']  # 2
s4.数学  # 2

# Slicing by position
s4.iloc[0:2]
"""
数学    2
英语    3
dtype: int64
"""

# Common Series attributes:
#   shape
#   size
#   index
#   values
s4.shape  # (3,)
s4.size  # 3
s4.index  # Index(['数学', '英语', '理综'], dtype='object')
s4.values  # array([2, 3, 4], dtype=int64)

#%%
# Common Series methods:
#   head(), tail()
#   unique()
#   isnull(), notnull()
#   add(), sub(), mul(), div()
s4.head(2)  # show the first 2 rows
s4.tail(2)  # show the last 2 rows (a bare tail() falls back to its default n=5)

s = Series([1, 2, 3, 2, 1])
s.unique()  # array([1, 2, 3], dtype=int64)

# Arithmetic
# Values are combined where the index labels match; labels present on only
# one side produce NaN.
s.add(s)
s + s

s1 = Series([1, 2])
s2 = Series([2, 3, 4])
s1.add(s2)
"""
0    3.0
1    5.0
2    NaN
"""

# Cleaning the values of a Series
s1 = Series([1, 2, 3, 4], index=list('abce'))
s2 = Series([1, 2, 3, 4], index=list('adcf'))
s = s1.add(s2)
"""
a    2.0
b    NaN
c    6.0
d    NaN
e    NaN
f    NaN
dtype: float64
"""

# Selecting values with a list of booleans as the key
s1 = Series([1, 2, 3, 4], index=list('abce'))
s1[[True, False, True, False]]
"""
a    1
c    3
dtype: int64
"""

s.isnull()
"""
a    False
b     True
c    False
d     True
e     True
f     True
dtype: bool
"""

s.notnull()
"""
a     True
b    False
c     True
d    False
e    False
f    False
dtype: bool
"""
# Drop the NaN entries by keeping only the non-null positions
s[s.notnull()]
"""
a    2.0
c    6.0
dtype: float64
"""
#%%

# DataFrame
# English summary of the note below: a DataFrame is a table-like structure
# made of ordered columns of data. It extends the use of Series from one
# dimension to several and has both a row index (index) and a column index
# (columns), in addition to its values.
"""
DataFrame是一个【表格型】的数据结构。DataFrame由按一定顺序排列的多列数据组成。
设计初衷是将Series的使用场景从一维拓展到多维。DataFrame既有行索引，也有列索引。
行索引：index
列索引：columns
值：values
"""
# Ways to create a DataFrame:
#   from an ndarray
#   from a dict
df = DataFrame(np.random.randint(0, 20, size=(3, 4)))
df


dic = dict(
    name=['andy', 'jack', 'mary'],
    salary=[1000, 2000, 3000],
)
df = DataFrame(dic, index=list('abc'))
df

# DataFrame attributes:
#   values
#   columns
#   index
#   shape

df.values
"""
array([['andy', 1000],
       ['jack', 2000],
       ['mary', 3000]], dtype=object)
"""
df.shape
# (3, 2)
df.index
# Index(['a', 'b', 'c'], dtype='object')

df.columns
# Index(['name', 'salary'], dtype='object')
#%%
# Indexing
# Pick columns with a list of column labels (the list order is the result order).
df.loc[:, ['salary', 'name']]

# A single row, by position and by label
df.iloc[0]
df.loc['a']
"""
name      andy
salary    1000
Name: a, dtype: object
"""
# Several rows
df.iloc[[0, 2]]
df.loc[['a', 'c']]

# A single element
df.loc['b', 'salary']  # 2000
df.iloc[1, 1]  # 2000

# Several elements
df.iloc[[1, 2], 1]
df.loc[['b', 'c'], 'salary']
"""
b    2000
c    3000
Name: salary, dtype: int64
"""
# Slicing
df

# Slice rows
df.iloc[:2]

# Slice columns
df.iloc[:, :1]
#%%
# Converting a column to a datetime type
#   pd.to_datetime(col)
# Using a column as the row index
#   df.set_index()

dic = {
    'time': ['2019-01-09', '2011-01-09', '2018-09-01'],
    'salary': [2222, 3333, 4444],
}

df = DataFrame(data=dic)
df

# Inspect the column dtypes of df
# (sample output as recorded by the author; row order and dtype names may
# differ depending on the pandas version -- TODO confirm on your install)
df.dtypes
"""
salary     int64
time      object
dtype: object
"""
# Convert the time column to a datetime type
df['time'] = pd.to_datetime(df['time'])
df.dtypes
"""
salary             int64
time      datetime64[ns]
dtype: object
"""

# Use the time column as the row index of the original frame.
# Passing the column label lets set_index remove that column itself
# (drop=True is its default), so no separate
# df.drop(labels='time', axis=1) step is required.
# For reference, in drop(): axis=0 refers to rows, axis=1 to columns.
df.set_index('time', inplace=True)
df
# Pandas简单使用