>>> import pandas as pd
>>> import numpy as np
>>> data = {'key1' : ['a', 'b', 'a', 'b', 'a', 'b', 'a', 'a'],
...         'key2' : ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
...         'data1': np.random.randn(8),
...         'data2': np.random.randn(8)}
>>>
>>> obj = pd.DataFrame(data)
>>> obj
  key1   key2     data1     data2
0    a    one -0.804160 -0.868905
1    b    one -0.086990  0.325741
2    a    two  0.757992  0.541101
3    b  three -0.281435  0.097841
4    a    two  0.817757 -0.643699
5    b    two -0.462760 -0.321196
6    a    one -0.403699  0.602138
7    a  three  0.883940 -0.850526
>>>
>>> obj.groupby('key1')
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x03CDB7C0>
>>>
>>> obj['data1'].groupby(obj['key1'])
<pandas.core.groupby.generic.SeriesGroupBy object at 0x03CDB748>
>>> import pandas as pd
>>> import numpy as np
>>> data = {'key1' : ['a', 'b', 'a', 'b', 'a', 'b', 'a', 'a'],
...         'key2' : ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
...         'data1': np.random.randn(8),
...         'data2': np.random.randn(8)}
>>>
>>> obj = pd.DataFrame(data)
>>> obj
  key1   key2     data1     data2
0    a    one -0.544099 -0.614079
1    b    one  2.193712  0.101005
2    a    two -0.004683  0.882770
3    b  three  0.312858  1.732105
4    a    two  0.011089  0.089587
5    b    two  0.292165  1.327638
6    a    one -1.433291 -0.238971
7    a  three -0.004724 -2.117326
>>>
>>> grouped1 = obj.groupby('key1')
>>> grouped2 = obj['data1'].groupby(obj['key1'])
>>>
>>> grouped1.mean()
         data1     data2
key1
a    -0.395142 -0.399604
b     0.932912  1.053583
>>>
>>> grouped2.mean()
key1
a   -0.395142
b    0.932912
Name: data1, dtype: float64
>>>
>>> grouped1.size()
key1
a    5
b    3
dtype: int64
>>>
>>> grouped2.size()
key1
a    5
b    3
Name: data1, dtype: int64
>>> import pandas as pd
>>> import numpy as np
>>> data = {'key1' : ['a', 'b', 'a', 'b', 'a', 'b', 'a', 'a'],
...         'key2' : ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
...         'data1': np.random.randn(8),
...         'data2': np.random.randn(8)}
>>> obj = pd.DataFrame(data)
>>> obj
  key1   key2     data1     data2
0    a    one -0.841652  0.688055
1    b    one  0.510042 -0.561171
2    a    two -0.418862 -0.145983
3    b  three -1.104698  0.563158
4    a    two  0.329527 -0.893108
5    b    two  0.753653 -0.342520
6    a    one -0.882527 -1.121329
7    a  three  1.726794  0.160244
>>>
>>> means = obj['data1'].groupby([obj['key1'], obj['key2']]).mean()
>>> means
key1  key2
a     one     -0.862090
      three    1.726794
      two     -0.044667
b     one      0.510042
      three   -1.104698
      two      0.753653
Name: data1, dtype: float64
>>>
>>> means.unstack()
key2       one     three       two
key1
a    -0.862090  1.726794 -0.044667
b     0.510042 -1.104698  0.753653
>>> import pandas as pd
>>> import numpy as np
>>> obj = pd.DataFrame(np.random.randint(1, 10, (5,5)), columns=['a', 'b', 'c', 'd', 'e'], index=['A', 'B', 'C', 'D', 'E'])
>>> obj
   a  b  c  d  e
A  1  4  7  1  9
B  8  2  4  7  8
C  9  8  2  5  1
D  2  4  2  8  3
E  7  5  7  2  3
>>>
>>> obj_dict = {'a':'Python', 'b':'Python', 'c':'Java', 'd':'C++', 'e':'Java'}
>>> obj.groupby(obj_dict, axis=1).size()
C++       1
Java      2
Python    2
dtype: int64
>>>
>>> obj.groupby(obj_dict, axis=1).count()
   C++  Java  Python
A    1     2       2
B    1     2       2
C    1     2       2
D    1     2       2
E    1     2       2
>>>
>>> obj.groupby(obj_dict, axis=1).sum()
   C++  Java  Python
A    1    16       5
B    7    12      10
C    5     3      17
D    8     5       6
E    2    10      12
>>> import pandas as pd
>>> import numpy as np
>>> obj = pd.DataFrame(np.random.randint(1, 10, (5,5)), columns=['a', 'b', 'c', 'd', 'e'], index=['AA', 'BBB', 'CC', 'D', 'EE'])
>>> obj
     a  b  c  d  e
AA   3  9  5  8  2
BBB  1  4  2  2  6
CC   9  2  4  7  6
D    2  5  5  7  1
EE   8  8  8  2  2
>>>
>>> def group_key(idx):
...     """
...     idx 为列索引或行索引
...     """
...     return len(idx)
>>> import pandas as pd
>>> import numpy as np
>>> data = {'key1' : ['a', 'b', 'a', 'b', 'a', 'b', 'a', 'a'],
...         'key2' : ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
...         'data1': np.random.randn(8),
...         'data2': np.random.randn(8)}
>>> obj = pd.DataFrame(data)
>>> obj
  key1   key2     data1     data2
0    a    one -1.088762  0.668504
1    b    one  0.275500  0.787844
2    a    two -0.108417 -0.491296
3    b  three  0.019524 -0.363390
4    a    two  0.453612  0.796999
5    b    two  1.982858  1.501877
6    a    one  1.101132 -1.928362
7    a  three  0.524775 -1.205842
>>>
>>> for group_name, group_data in obj.groupby('key1'):
...     print(group_name)
...     print(group_data)
...
a
  key1   key2     data1     data2
0    a    one -1.088762  0.668504
2    a    two -0.108417 -0.491296
4    a    two  0.453612  0.796999
6    a    one  1.101132 -1.928362
7    a  three  0.524775 -1.205842
b
  key1   key2     data1     data2
1    b    one  0.275500  0.787844
3    b  three  0.019524 -0.363390
5    b    two  1.982858  1.501877
>>> import pandas as pd
>>> import numpy as np
>>> data = {'key1' : ['a', 'b', 'a', 'b', 'a', 'b', 'a', 'a'],
...         'key2' : ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
...         'data1': np.random.randn(8),
...         'data2': np.random.randn(8)}
>>> obj = pd.DataFrame(data)
>>> obj
  key1   key2     data1     data2
0    a    one -1.088762  0.668504
1    b    one  0.275500  0.787844
2    a    two -0.108417 -0.491296
3    b  three  0.019524 -0.363390
4    a    two  0.453612  0.796999
5    b    two  1.982858  1.501877
6    a    one  1.101132 -1.928362
7    a  three  0.524775 -1.205842
>>>
>>> for group_name, group_data in obj.groupby(['key1', 'key2']):
...     print(group_name)
...     print(group_data)
...
('a', 'one')
  key1 key2     data1     data2
0    a  one -1.088762  0.668504
6    a  one  1.101132 -1.928362
('a', 'three')
  key1   key2     data1     data2
7    a  three  0.524775 -1.205842
('a', 'two')
  key1 key2     data1     data2
2    a  two -0.108417 -0.491296
4    a  two  0.453612  0.796999
('b', 'one')
  key1 key2   data1     data2
1    b  one  0.2755  0.787844
('b', 'three')
  key1   key2     data1    data2
3    b  three  0.019524 -0.36339
('b', 'two')
  key1 key2     data1     data2
5    b  two  1.982858  1.501877
>>> import pandas as pd
>>> import numpy as np
>>> data = {'key1' : ['a', 'b', 'a', 'b', 'a', 'b', 'a', 'a'],
...         'key2' : ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
...         'data1': np.random.randn(8),
...         'data2': np.random.randn(8)}
>>> obj = pd.DataFrame(data)
>>> obj
  key1   key2     data1     data2
0    a    one -0.607009  1.948301
1    b    one  0.150818 -0.025095
2    a    two -2.086024  0.358164
3    b  three  0.446061  1.708797
4    a    two  0.745457 -0.980948
5    b    two  0.981877  2.159327
6    a    one  0.804480 -0.499661
7    a  three  0.112884  0.004367
>>>
>>> grouped = obj.groupby('key1')
>>> list(grouped)
[('a',   key1   key2     data1     data2
0    a    one -0.607009  1.948301
2    a    two -2.086024  0.358164
4    a    two  0.745457 -0.980948
6    a    one  0.804480 -0.499661
7    a  three  0.112884  0.004367), ('b',   key1   key2     data1     data2
1    b    one  0.150818 -0.025095
3    b  three  0.446061  1.708797
5    b    two  0.981877  2.159327)]
>>>
>>> dict(list(grouped))
{'a':   key1   key2     data1     data2
0    a    one -0.607009  1.948301
2    a    two -2.086024  0.358164
4    a    two  0.745457 -0.980948
6    a    one  0.804480 -0.499661
7    a  three  0.112884  0.004367, 'b':   key1   key2     data1     data2
1    b    one  0.150818 -0.025095
3    b  three  0.446061  1.708797
5    b    two  0.981877  2.159327}
【04x00】GroupBy Apply 数据应用
聚合指的是任何能够从数组产生标量值的数据转换过程，常用于对分组后的数据进行计算。
【04x01】聚合函数
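之前的例子已经用过一些内置的聚合函数，比如 mean、count、min 以及 sum 等。常见的聚合运算如下表所示：

| 函数 | 描述 |
| --- | --- |
| count | 分组中非 NA 值的数量 |
| sum | 非 NA 值的和 |
| mean | 非 NA 值的平均值 |
| median | 非 NA 值的中位数 |
| std、var | 标准差和方差 |
| min、max | 非 NA 值的最小值、最大值 |
| prod | 非 NA 值的乘积 |
| first、last | 非 NA 值的第一个、最后一个 |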
>>> import pandas as pd
>>> import numpy as np
>>> obj = {'key1' : ['a', 'b', 'a', 'b', 'a', 'b', 'a', 'a'],
...        'key2' : ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
...        'data1': np.random.randint(1,10, 8),
...        'data2': np.random.randint(1,10, 8)}
>>> obj = pd.DataFrame(obj)
>>> obj
  key1   key2  data1  data2
0    a    one      9      7
1    b    one      5      9
2    a    two      2      4
3    b  three      3      4
4    a    two      5      1
5    b    two      5      9
6    a    one      1      8
7    a  three      2      4
>>>
>>> obj.groupby('key1').sum()
      data1  data2
key1
a        19     24
b        13     22
>>>
>>> obj.groupby('key1').max()
     key2  data1  data2
key1
a     two      9      8
b     two      5      9
>>>
>>> obj.groupby('key1').min()
     key2  data1  data2
key1
a     one      1      1
b     one      3      4
>>>
>>> obj.groupby('key1').mean()
         data1     data2
key1
a     3.800000  4.800000
b     4.333333  7.333333
>>>
>>> obj.groupby('key1').size()
key1
a    5
b    3
dtype: int64
>>>
>>> obj.groupby('key1').count()
      key2  data1  data2
key1
a        5      5      5
b        3      3      3
>>>
>>> obj.groupby('key1').describe()
     data1                           ... data2
     count      mean       std  min  25%  ...   min  25%  50%  75%  max
key1                                 ...
a      5.0  3.800000  3.271085  1.0  2.0  ...   1.0  4.0  4.0  7.0  8.0
b      3.0  4.333333  1.154701  3.0  4.0  ...   4.0  6.5  9.0  9.0  9.0
>>> import pandas as pd
>>> import numpy as np
>>> obj = {'key1' : ['a', 'b', 'a', 'b', 'a', 'b', 'a', 'a'],
...        'key2' : ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
...        'data1': np.random.randint(1,10, 8),
...        'data2': np.random.randint(1,10, 8)}
>>> obj = pd.DataFrame(obj)
>>> obj
  key1   key2  data1  data2
0    a    one      9      7
1    b    one      5      9
2    a    two      2      4
3    b  three      3      4
4    a    two      5      1
5    b    two      5      9
6    a    one      1      8
7    a  three      2      4
>>>
>>> def peak_range(df):
...     return df.max() - df.min()
...
>>>
>>> obj.groupby('key1').agg(peak_range)
      data1  data2
key1
a         8      7
b         2      5
>>>
>>> obj.groupby('key1').agg(lambda df : df.max() - df.min())
      data1  data2
key1
a         8      7
b         2      5
>>> import pandas as pd
>>> import numpy as np
>>> obj = {'key1' : ['a', 'b', 'a', 'b', 'a', 'b', 'a', 'a'],
...        'key2' : ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
...        'data1': np.random.randint(1,10, 8),
...        'data2': np.random.randint(1,10, 8)}
>>> obj = pd.DataFrame(obj)
>>> obj
  key1   key2  data1  data2
0    a    one      9      7
1    b    one      5      9
2    a    two      2      4
3    b  three      3      4
4    a    two      5      1
5    b    two      5      9
6    a    one      1      8
7    a  three      2      4
>>>
>>> dict1 = {'data1':'mean', 'data2':'sum'}
>>> dict2 = {'data1':['mean','max'], 'data2':'sum'}
>>>
>>> obj.groupby('key1').agg(dict1)
         data1  data2
key1
a     3.800000     24
b     4.333333     22
>>>
>>> obj.groupby('key1').agg(dict2)
         data1     data2
          mean max   sum
key1
a     3.800000   9    24
b     4.333333   5    22
>>> import pandas as pd
>>> obj = pd.DataFrame({'A':['bob','sos','bob','sos','bob','sos','bob','bob'],
...                     'B':['one','one','two','three','two','two','one','three'],
...                     'C':[3,1,4,1,5,9,2,6],
...                     'D':[1,2,3,4,5,6,7,8]})
>>> obj
     A      B  C  D
0  bob    one  3  1
1  sos    one  1  2
2  bob    two  4  3
3  sos  three  1  4
4  bob    two  5  5
5  sos    two  9  6
6  bob    one  2  7
7  bob  three  6  8
>>>
>>> grouped = obj.groupby('A')
>>> for name, group in grouped:
...     print(name)
...     print(group)
...
bob
     A      B  C  D
0  bob    one  3  1
2  bob    two  4  3
4  bob    two  5  5
6  bob    one  2  7
7  bob  three  6  8
sos
     A      B  C  D
1  sos    one  1  2
3  sos  three  1  4
5  sos    two  9  6
>>>
>>> grouped.apply(lambda x:x.describe())  # 对 bob 和 sos 两组数据使用 describe 方法
                  C         D
A
bob count  5.000000  5.000000
    mean   4.000000  4.800000
    std    1.581139  2.863564
    min    2.000000  1.000000
    25%    3.000000  3.000000
    50%    4.000000  5.000000
    75%    5.000000  7.000000
    max    6.000000  8.000000
sos count  3.000000  3.000000
    mean   3.666667  4.000000
    std    4.618802  2.000000
    min    1.000000  2.000000
    25%    1.000000  3.000000
    50%    1.000000  4.000000
    75%    5.000000  5.000000
    max    9.000000  6.000000
>>>
>>> grouped.apply(lambda x:x.min())  # 对 bob 和 sos 两组数据使用 min 方法
       A    B  C  D
A
bob  bob  one  2  1
sos  sos  one  1  2