Pandas常用操作

来自CloudWiki
跳转至: 导航搜索


准备工作

import numpy as np
import pandas as pd

切片

a=pd.Series([2,3,6,1],index=[4,7,3,1])

a[1]#显式大于隐式

1

a[3]#按显式索引3取值;a[2]会报KeyError,因为索引中没有2

6

loc切片和iloc切片

使用loc切片更明了,表示按自定义的索引切片

a.loc[1]#包含起点也包含终点

1

使用iloc切片表示按默认的行号索引切片

a.iloc[0]

2

a.iloc[0:2]#包含起点不包含终点

4    2
7    3
dtype: int64

注意包含起点与终点

a.loc[4:3]

4    2
7    3
3    6
dtype: int64

只包含起点

a.iloc[1:2]

7    3
dtype: int64

b=pd.DataFrame(np.random.randn(6,3),index=range(2,8), columns=["a","b","c"])

b

a 	b 	c
2 	-0.984031 	2.002845 	-0.547842
3 	0.837882 	-0.111692 	-0.051079
4 	-0.626863 	-0.084946 	0.069867
5 	-0.434401 	1.155147 	0.351529
6 	-0.340527 	-1.047685 	-0.689944
7 	0.982684 	1.290136 	-0.236716

b[2:4]#数据框的切片隐式与显式索引容易混淆

a 	b 	c
4 	-0.626863 	-0.084946 	0.069867
5 	-0.434401 	1.155147 	0.351529

b["a"]

2   -0.984031
3    0.837882
4   -0.626863
5   -0.434401
6   -0.340527
7    0.982684
Name: a, dtype: float64

b.loc[2:4]

a 	b 	c
2 	-0.984031 	2.002845 	-0.547842
3 	0.837882 	-0.111692 	-0.051079
4 	-0.626863 	-0.084946 	0.069867

b.iloc[2:4]

a 	b 	c
4 	-0.626863 	-0.084946 	0.069867
5 	-0.434401 	1.155147 	0.351529

b.loc[2:3].a.loc[2]

-0.98403138550212987

b.loc[2].a

-0.98403138550212987


赋值

b.loc[2].a=6#链式赋值,不保证写回原数据;推荐写成 b.loc[2,"a"]=6

b

a 	b 	c
2 	6.000000 	2.002845 	-0.547842
3 	0.837882 	-0.111692 	-0.051079
4 	-0.626863 	-0.084946 	0.069867
5 	-0.434401 	1.155147 	0.351529
6 	-0.340527 	-1.047685 	-0.689944
7 	0.982684 	1.290136 	-0.236716

b.loc[2].a="aaa"#字符串赋值不进去的

b

a 	b 	c
2 	6.000000 	2.002845 	-0.547842
3 	0.837882 	-0.111692 	-0.051079
4 	-0.626863 	-0.084946 	0.069867
5 	-0.434401 	1.155147 	0.351529
6 	-0.340527 	-1.047685 	-0.689944
7 	0.982684 	1.290136 	-0.236716

过滤

a

4    2
7    3
3    6
1    1
dtype: int64

a[a>2]

7    3
3    6
dtype: int64

b

a 	b 	c
2 	6.000000 	2.002845 	-0.547842
3 	0.837882 	-0.111692 	-0.051079
4 	-0.626863 	-0.084946 	0.069867
5 	-0.434401 	1.155147 	0.351529
6 	-0.340527 	-1.047685 	-0.689944
7 	0.982684 	1.290136 	-0.236716

b[b.b>0]

a 	b 	c
2 	6.000000 	2.002845 	-0.547842
5 	-0.434401 	1.155147 	0.351529
7 	0.982684 	1.290136 	-0.236716

b[b["a"]>0]

a 	b 	c
2 	6.000000 	2.002845 	-0.547842
3 	0.837882 	-0.111692 	-0.051079
7 	0.982684 	1.290136 	-0.236716

b[(b.a>0)&(b.c>0)]

a b c

b[(b.a>0)|(b.c>0)]

a 	b 	c
2 	6.000000 	2.002845 	-0.547842
3 	0.837882 	-0.111692 	-0.051079
4 	-0.626863 	-0.084946 	0.069867
5 	-0.434401 	1.155147 	0.351529
7 	0.982684 	1.290136 	-0.236716

b[(b.a>0)^(b.c>0)][["a","c"]]

a 	c
2 	6.000000 	-0.547842
3 	0.837882 	-0.051079
4 	-0.626863 	0.069867
5 	-0.434401 	0.351529
7 	0.982684 	-0.236716


迭代

a

4    2
7    3
3    6
1    1
dtype: int64

for i in a.items():#iteritems在pandas 2.0中已移除,改用items

   print(i)
(4, 2)
(7, 3)
(3, 6)
(1, 1)

b

a 	b 	c
2 	6.000000 	2.002845 	-0.547842
3 	0.837882 	-0.111692 	-0.051079
4 	-0.626863 	-0.084946 	0.069867
5 	-0.434401 	1.155147 	0.351529
6 	-0.340527 	-1.047685 	-0.689944
7 	0.982684 	1.290136 	-0.236716

for i,j in b.items():#i为列标,j为该列的数据(Series);iteritems在pandas 2.0中已移除,改用items

   print(i)
   print(j)
a
2    6.000000
3    0.837882
4   -0.626863
5   -0.434401
6   -0.340527
7    0.982684
Name: a, dtype: float64
b
2    2.002845
3   -0.111692
4   -0.084946
5    1.155147
6   -1.047685
7    1.290136
Name: b, dtype: float64
c
2   -0.547842
3   -0.051079
4    0.069867
5    0.351529
6   -0.689944
7   -0.236716
Name: c, dtype: float64

for i,j in b.iterrows():

   print(i)
   print(j)
2
a    6.000000
b    2.002845
c   -0.547842
Name: 2, dtype: float64
3
a    0.837882
b   -0.111692
c   -0.051079
Name: 3, dtype: float64
4
a   -0.626863
b   -0.084946
c    0.069867
Name: 4, dtype: float64
5
a   -0.434401
b    1.155147
c    0.351529
Name: 5, dtype: float64
6
a   -0.340527
b   -1.047685
c   -0.689944
Name: 6, dtype: float64
7
a    0.982684
b    1.290136
c   -0.236716
Name: 7, dtype: float64

排序

a1=pd.Series([2,3,6,1],index=["i","u","2","1"])#排序需要在同种类型的数据中进行(比如整数与字符就不能比较)

a1

i    2
u    3
2    6
1    1
dtype: int64

a1.sort_values()

1    1
i    2
u    3
2    6
dtype: int64

a1.sort_index()

1    1
2    6
i    2
u    3
dtype: int64

b1=pd.DataFrame(np.random.randn(4,3),index=[2,4,1,3], columns=["b","a","k"])

b1

b 	a 	k
2 	-0.586665 	-1.131802 	1.163450
4 	-1.020215 	0.219465 	-0.017945
1 	1.236446 	-0.108178 	-0.293046
3 	-0.549749 	0.897673 	0.679992

b1.sort_index()

b 	a 	k
1 	1.236446 	-0.108178 	-0.293046
2 	-0.586665 	-1.131802 	1.163450
3 	-0.549749 	0.897673 	0.679992
4 	-1.020215 	0.219465 	-0.017945

b1.sort_index(axis=1)#按列标排

a 	b 	k
2 	-1.131802 	-0.586665 	1.163450
4 	0.219465 	-1.020215 	-0.017945
1 	-0.108178 	1.236446 	-0.293046
3 	0.897673 	-0.549749 	0.679992

b1.sort_values(by="k")[["k","b"]]#按某一列排

k 	b
1 	-0.293046 	1.236446
4 	-0.017945 	-1.020215
3 	0.679992 	-0.549749
2 	1.163450 	-0.586665

len(b)

6