>>> import pandas as pd >>> obj = pd.Series([1, 5, -8, 2], index=['a', 'b', 'c', 'd']) >>> obj a 1 b 5 c -8 d 2 dtype: int64 >>> >>> obj[1:3] b 5 c -8 dtype: int64 >>> >>> obj[0:3:2] a 1 c -8 dtype: int64 >>> >>> obj['b':'d'] b 5 c -8 d 2 dtype: int64
>>> import pandas as pd >>> obj = pd.Series([1, 5, -8, 2, -3], index=['a', 'b', 'c', 'd', 'e']) >>> obj a 1 b 5 c -8 d 2 e -3 dtype: int64 >>> >>> obj[obj > 0] a 1 b 5 d 2 dtype: int64 >>> >>> obj > 0 a True b True c False d True e False dtype: bool
>>> import pandas as pd >>> import numpy as np >>> obj = pd.DataFrame(np.random.randn(8,4), columns = ['a', 'b', 'c', 'd']) >>> obj a b c d 0-1.3993900.521596-0.8696130.506621 1-0.748562-0.3649520.188399-1.402566 21.378776-1.4764800.3616350.451134 3-0.206405-1.1886093.0025990.563650 40.9932891.1337481.177549-2.562286 5-0.4821571.0692931.143983-1.303079 6-1.1991540.2203600.801838-0.104533 7-1.359816-2.0920352.003530-0.151812 >>> >>> obj.head() a b c d 0-1.3993900.521596-0.8696130.506621 1-0.748562-0.3649520.188399-1.402566 21.378776-1.4764800.3616350.451134 3-0.206405-1.1886093.0025990.563650 40.9932891.1337481.177549-2.562286 >>> >>> obj.head(3) a b c d 0-1.3993900.521596-0.8696130.506621 1-0.748562-0.3649520.188399-1.402566 21.378776-1.4764800.3616350.451134 >>> >>> obj.tail() a b c d 3-0.206405-1.1886093.0025990.563650 40.9932891.1337481.177549-2.562286 5-0.4821571.0692931.143983-1.303079 6-1.1991540.2203600.801838-0.104533 7-1.359816-2.0920352.003530-0.151812 >>> >>> obj.tail(3) a b c d 5-0.4821571.0692931.143983-1.303079 6-1.1991540.2203600.801838-0.104533 7-1.359816-2.0920352.003530-0.151812
>>> import pandas as pd
>>> import numpy as np
>>> data = np.random.randn(5,4)
>>> index = ['I1', 'I2', 'I3', 'I4', 'I5']
>>> columns = ['a', 'b', 'c', 'd']
>>> obj = pd.DataFrame(data, index, columns)
>>> obj
           a         b         c         d
I1  0.828676 -1.663337  1.753632  1.432487
I2  0.368138  0.222166  0.902764 -1.436186
I3  2.285615 -2.415175 -1.344456 -0.502214
I4  3.224288 -0.500268  1.293596 -1.235549
I5 -0.938833 -0.804433 -0.170047 -0.566766
>>>
>>> obj[0:3]
           a         b         c         d
I1  0.828676 -1.663337  1.753632  1.432487
I2  0.368138  0.222166  0.902764 -1.436186
I3  2.285615 -2.415175 -1.344456 -0.502214
>>>
>>> obj[0:4:2]
           a         b         c         d
I1  0.828676 -1.663337  1.753632  1.432487
I3  2.285615 -2.415175 -1.344456 -0.502214
>>>
>>> obj['I2':'I4']
           a         b         c         d
I2  0.368138  0.222166  0.902764 -1.436186
I3  2.285615 -2.415175 -1.344456 -0.502214
I4  3.224288 -0.500268  1.293596 -1.235549
【2.2.4】花式索引
和 Series 一样,所谓的花式索引,就是间隔索引、不连续的索引,传递一个由列名(columns)组成的列表来一次性获得多列元素:
>>> import pandas as pd >>> import numpy as np >>> data = np.random.randn(5,4) >>> index = ['I1', 'I2', 'I3', 'I4', 'I5'] >>> columns = ['a', 'b', 'c', 'd'] >>> obj = pd.DataFrame(data, index, columns) >>> obj a b c d I1 -0.602984-0.1357160.999689-0.339786 I2 0.911130-0.092485-0.914074-0.279588 I3 0.849606-0.420055-1.240389-0.179297 I4 0.249986-1.2506680.329416-1.105774 I5 -0.7438160.430647-0.058126-0.337319 >>> >>> obj[obj > 0] a b c d I1 NaN NaN 0.999689 NaN I2 0.911130 NaN NaN NaN I3 0.849606 NaN NaN NaN I4 0.249986 NaN 0.329416 NaN I5 NaN 0.430647 NaN NaN >>> >>> obj > 0 a b c d I1 FalseFalseTrueFalse I2 TrueFalseFalseFalse I3 TrueFalseFalseFalse I4 TrueFalseTrueFalse I5 FalseTrueFalseFalse
>>> import pandas as pd
>>> obj = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=['a', 'b', 'c'], columns=['A', 'B', 'C'])
>>> obj
   A  B  C
a  1  2  3
b  4  5  6
c  7  8  9
>>>
>>> obj.loc['a']
A    1
B    2
C    3
Name: a, dtype: int64
>>>
>>> obj.loc['a':'c']
   A  B  C
a  1  2  3
b  4  5  6
c  7  8  9
>>>
>>> obj.loc[['a', 'c']]
   A  B  C
a  1  2  3
c  7  8  9
>>>
>>> obj.loc['b', 'B']
5
>>> obj.loc['b', 'A':'C']
A    4
B    5
C    6
Name: b, dtype: int64
【3.2】iloc 位置索引
作用和 loc 一样,不过是基于索引的编号来索引,即根据 index 和 columns 的位置编号来选择数据。
>>> import pandas as pd
>>> obj = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=['a', 'b', 'c'], columns=['A', 'B', 'C'])
>>> obj
   A  B  C
a  1  2  3
b  4  5  6
c  7  8  9
>>>
>>> obj.iloc[1]
A    4
B    5
C    6
Name: b, dtype: int64
>>>
>>> obj.iloc[0:2]
   A  B  C
a  1  2  3
b  4  5  6
>>>
>>> obj.iloc[[0, 2]]
   A  B  C
a  1  2  3
c  7  8  9
>>>
>>> obj.iloc[1, 2]
6
>>>
>>> obj.iloc[1, 0:2]
A    4
B    5
Name: b, dtype: int64
插值(填充)方式,取值如下: None:不填补空白; pad / ffill:将上一个有效观测值向前(forward)传播,直到下一个有效观测值; backfill / bfill:使用下一个有效观测值来填补空白; nearest:使用距离最近的有效观测值来填补空白。
fill_value
在重新索引的过程中,需要引入缺失值时使用的替代值
limit
前向或后向填充时的最大填充量
tolerance
向前或向后填充时,填充不准确匹配项的最大间距(绝对值距离)
level
在 MultiIndex 的指定级别上匹配简单索引,否则选其子集
copy
默认为 True,即无论如何都复制数据;如果为 False,则当新索引与旧索引相同时不复制数据
reindex 将会根据新索引进行重排。如果某个索引值当前不存在,就引入缺失值:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
>>> import pandas as pd >>> obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c']) >>> obj d 4.5 b 7.2 a -5.3 c 3.6 dtype: float64 >>> >>> obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e']) >>> obj2 a -5.3 b 7.2 c 3.6 d 4.5 e NaN dtype: float64
>>> import pandas as pd
>>> import numpy as np
>>> obj = pd.DataFrame(np.arange(9).reshape((3, 3)), index=['a', 'c', 'd'], columns=['Ohio', 'Texas', 'California'])
>>> obj
   Ohio  Texas  California
a     0      1           2
c     3      4           5
d     6      7           8
>>>
>>> obj2 = obj.reindex(['a', 'b', 'c', 'd'])
>>> obj2
   Ohio  Texas  California
a   0.0    1.0         2.0
b   NaN    NaN         NaN
c   3.0    4.0         5.0
d   6.0    7.0         8.0
列可以用 columns 关键字重新索引:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
>>> import pandas as pd
>>> import numpy as np
>>> obj = pd.DataFrame(np.arange(9).reshape((3, 3)), index=['a', 'c', 'd'], columns=['Ohio', 'Texas', 'California'])
>>> obj
   Ohio  Texas  California
a     0      1           2
c     3      4           5
d     6      7           8
>>>
>>> states = ['Texas', 'Utah', 'California']
>>> obj.reindex(columns=states)
   Texas  Utah  California
a      1   NaN           2
c      4   NaN           5
d      7   NaN           8