查看“Pandas常用操作”的源代码




==准备工作==
import numpy as np
import pandas as pd

==切片==

a=pd.Series([2,3,6,1],index=[4,7,3,1])

a[1]#显式索引（自定义标签）优先于隐式索引（位置序号）

 1

a[2]

 KeyError: 2

===loc切片和iloc切片===
使用loc切片更明了，表示按自定义的索引切片

a.loc[1]#按显式索引取值（用loc切片时包含起点也包含终点）

 1

使用iloc切片表示按默认的行号索引切片

a.iloc[0]

 2

a.iloc[0:2]#包含起点不包含终点

 <nowiki>4    2
7    3
dtype: int64</nowiki>

注意包含起点与终点

a.loc[4:3]

 <nowiki>4    2
7    3
3    6
dtype: int64</nowiki>

iloc切片只包含起点，不包含终点

a.iloc[1:2]

 <nowiki>7    3
dtype: int64</nowiki>

b=pd.DataFrame(np.random.randn(6,3),index=range(2,8), columns=["a","b","c"])
b

 <nowiki>a 	b 	c
2 	-0.984031 	2.002845 	-0.547842
3 	0.837882 	-0.111692 	-0.051079
4 	-0.626863 	-0.084946 	0.069867
5 	-0.434401 	1.155147 	0.351529
6 	-0.340527 	-1.047685 	-0.689944
7 	0.982684 	1.290136 	-0.236716</nowiki>

b[2:4]#数据框的切片隐式与显式索引容易混淆

 <nowiki>a 	b 	c
4 	-0.626863 	-0.084946 	0.069867
5 	-0.434401 	1.155147 	0.351529</nowiki>

b["a"]

 <nowiki>2   -0.984031
3    0.837882
4   -0.626863
5   -0.434401
6   -0.340527
7    0.982684
Name: a, dtype: float64</nowiki>

b.loc[2:4]

 <nowiki>a 	b 	c
2 	-0.984031 	2.002845 	-0.547842
3 	0.837882 	-0.111692 	-0.051079
4 	-0.626863 	-0.084946 	0.069867</nowiki>

b.iloc[2:4]

 <nowiki>a 	b 	c
4 	-0.626863 	-0.084946 	0.069867
5 	-0.434401 	1.155147 	0.351529</nowiki>

b.loc[2:3].a.loc[2]

 <nowiki>-0.98403138550212987</nowiki>

b.loc[2].a

 <nowiki>-0.98403138550212987</nowiki>


赋值（注意：b.loc[2].a=6 这种写法属于链式赋值，可能只修改了临时副本；更稳妥的写法是 b.loc[2,"a"]=6）

b.loc[2].a=6

b

 <nowiki>a 	b 	c
2 	6.000000 	2.002845 	-0.547842
3 	0.837882 	-0.111692 	-0.051079
4 	-0.626863 	-0.084946 	0.069867
5 	-0.434401 	1.155147 	0.351529
6 	-0.340527 	-1.047685 	-0.689944
7 	0.982684 	1.290136 	-0.236716</nowiki>

b.loc[2].a="aaa"#字符串赋值不进去的

b

 <nowiki>a 	b 	c
2 	6.000000 	2.002845 	-0.547842
3 	0.837882 	-0.111692 	-0.051079
4 	-0.626863 	-0.084946 	0.069867
5 	-0.434401 	1.155147 	0.351529
6 	-0.340527 	-1.047685 	-0.689944
7 	0.982684 	1.290136 	-0.236716</nowiki>

==过滤==

a

 <nowiki>a     2
1     3
c     6
b     1
a1    2
dtype: int64</nowiki>

a[a>2]

 <nowiki>1    3
c    6
dtype: int64</nowiki>

b

 <nowiki>a 	b 	c
2 	6.000000 	2.002845 	-0.547842
3 	0.837882 	-0.111692 	-0.051079
4 	-0.626863 	-0.084946 	0.069867
5 	-0.434401 	1.155147 	0.351529
6 	-0.340527 	-1.047685 	-0.689944
7 	0.982684 	1.290136 	-0.236716</nowiki>

b[b.b>0]

	<nowiki>a 	b 	c
2 	6.000000 	2.002845 	-0.547842
5 	-0.434401 	1.155147 	0.351529
7 	0.982684 	1.290136 	-0.236716</nowiki>

b[b["a"]>0]

 <nowiki>a 	b 	c
2 	6.000000 	2.002845 	-0.547842
3 	0.837882 	-0.111692 	-0.051079
7 	0.982684 	1.290136 	-0.236716</nowiki>

b[(b.a>0)&(b.c>0)]

 <nowiki>a 	b 	c</nowiki>

b[(b.a>0)|(b.c>0)]

 <nowiki>a 	b 	c
2 	6.000000 	2.002845 	-0.547842
3 	0.837882 	-0.111692 	-0.051079
4 	-0.626863 	-0.084946 	0.069867
5 	-0.434401 	1.155147 	0.351529
7 	0.982684 	1.290136 	-0.236716</nowiki>

b[(b.a>0)^(b.c>0)][["a","c"]]

 <nowiki>a 	c
2 	6.000000 	-0.547842
3 	0.837882 	-0.051079
4 	-0.626863 	0.069867
5 	-0.434401 	0.351529
7 	0.982684 	-0.236716</nowiki>


==迭代==

a

 <nowiki>4    2
7    3
3    6
1    1
dtype: int64</nowiki>

for i in a.iteritems():#iteritems在pandas 2.0中已移除，新版本请改用a.items()
    print(i)

 <nowiki>(4, 2)
(7, 3)
(3, 6)
(1, 1)</nowiki>

b

 <nowiki>a 	b 	c
2 	6.000000 	2.002845 	-0.547842
3 	0.837882 	-0.111692 	-0.051079
4 	-0.626863 	-0.084946 	0.069867
5 	-0.434401 	1.155147 	0.351529
6 	-0.340527 	-1.047685 	-0.689944
7 	0.982684 	1.290136 	-0.236716</nowiki>

for i,j in b.iteritems():#i为列标，j为该列的数据（Series）；iteritems在pandas 2.0中已移除，新版本请改用b.items()
    print(i)
    print(j)

 <nowiki>a
2    6.000000
3    0.837882
4   -0.626863
5   -0.434401
6   -0.340527
7    0.982684
Name: a, dtype: float64
b
2    2.002845
3   -0.111692
4   -0.084946
5    1.155147
6   -1.047685
7    1.290136
Name: b, dtype: float64
c
2   -0.547842
3   -0.051079
4    0.069867
5    0.351529
6   -0.689944
7   -0.236716
Name: c, dtype: float64</nowiki>

for i,j in b.iterrows():
    print(i)
    print(j)

 <nowiki>2
a    6.000000
b    2.002845
c   -0.547842
Name: 2, dtype: float64
3
a    0.837882
b   -0.111692
c   -0.051079
Name: 3, dtype: float64
4
a   -0.626863
b   -0.084946
c    0.069867
Name: 4, dtype: float64
5
a   -0.434401
b    1.155147
c    0.351529
Name: 5, dtype: float64
6
a   -0.340527
b   -1.047685
c   -0.689944
Name: 6, dtype: float64
7
a    0.982684
b    1.290136
c   -0.236716
Name: 7, dtype: float64</nowiki>

==排序==

a1=pd.Series([2,3,6,1],index=["i","u","2","1"])#排序需要在同种类型的数据中进行（比如整数与字符就不能比较）

a1

 <nowiki>i    2
u    3
2    6
1    1
dtype: int64</nowiki>

a1.sort_values()

 <nowiki>1    1
i    2
u    3
2    6
dtype: int64</nowiki>

a1.sort_index()

 <nowiki>1    1
2    6
i    2
u    3
dtype: int64
</nowiki>

b1=pd.DataFrame(np.random.randn(4,3),index=[2,4,1,3], columns=["b","a","k"])

b1     

 <nowiki>b 	a 	k
2 	-0.586665 	-1.131802 	1.163450
4 	-1.020215 	0.219465 	-0.017945
1 	1.236446 	-0.108178 	-0.293046
3 	-0.549749 	0.897673 	0.679992
</nowiki>

b1.sort_index()

 <nowiki>b 	a 	k
1 	1.236446 	-0.108178 	-0.293046
2 	-0.586665 	-1.131802 	1.163450
3 	-0.549749 	0.897673 	0.679992
4 	-1.020215 	0.219465 	-0.017945</nowiki>

b1.sort_index(axis=1)#按列标排

 <nowiki>a 	b 	k
2 	-1.131802 	-0.586665 	1.163450
4 	0.219465 	-1.020215 	-0.017945
1 	-0.108178 	1.236446 	-0.293046
3 	0.897673 	-0.549749 	0.679992</nowiki>

b1.sort_values(by="k")[["k","b"]]#按某一列排

 <nowiki>k 	b
1 	-0.293046 	1.236446
4 	-0.017945 	-1.020215
3 	0.679992 	-0.549749
2 	1.163450 	-0.586665</nowiki>

len(b)

 6