所有 Notebook 的源文件都可以在 GitHub 上访问到呀~

Group by

In [37]:
import pandas as pd
import numpy as np
In [38]:
#选取df中SUMLEV值为50的部分
df = pd.read_csv("census.csv")
df = df[df["SUMLEV"] == 50]
df
Out[38]:
SUMLEV REGION DIVISION STATE COUNTY STNAME CTYNAME CENSUS2010POP ESTIMATESBASE2010 POPESTIMATE2010 ... RDOMESTICMIG2011 RDOMESTICMIG2012 RDOMESTICMIG2013 RDOMESTICMIG2014 RDOMESTICMIG2015 RNETMIG2011 RNETMIG2012 RNETMIG2013 RNETMIG2014 RNETMIG2015
1 50 3 6 1 1 Alabama Autauga County 54571 54571 54660 ... 7.242091 -2.915927 -3.012349 2.265971 -2.530799 7.606016 -2.626146 -2.722002 2.592270 -2.187333
2 50 3 6 1 3 Alabama Baldwin County 182265 182265 183193 ... 14.832960 17.647293 21.845705 19.243287 17.197872 15.844176 18.559627 22.727626 20.317142 18.293499
3 50 3 6 1 5 Alabama Barbour County 27457 27457 27341 ... -4.728132 -2.500690 -7.056824 -3.904217 -10.543299 -4.874741 -2.758113 -7.167664 -3.978583 -10.543299
4 50 3 6 1 7 Alabama Bibb County 22915 22919 22861 ... -5.527043 -5.068871 -6.201001 -0.177537 0.177258 -5.088389 -4.363636 -5.403729 0.754533 1.107861
5 50 3 6 1 9 Alabama Blount County 57322 57322 57373 ... 1.807375 -1.177622 -1.748766 -2.062535 -1.369970 1.859511 -0.848580 -1.402476 -1.577232 -0.884411
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3188 50 4 8 56 37 Wyoming Sweetwater County 43806 43806 43593 ... 1.072643 16.243199 -5.339774 -14.252889 -14.248864 1.255221 16.243199 -5.295460 -14.075283 -14.070195
3189 50 4 8 56 39 Wyoming Teton County 21294 21294 21297 ... -1.589565 0.972695 19.525929 14.143021 -0.564849 0.654527 2.408578 21.160658 16.308671 1.520747
3190 50 4 8 56 41 Wyoming Uinta County 21118 21118 21102 ... -17.755986 -4.916350 -6.902954 -14.215862 -12.127022 -18.136812 -5.536861 -7.521840 -14.740608 -12.606351
3191 50 4 8 56 43 Wyoming Washakie County 8533 8533 8545 ... -11.637475 -0.827815 -2.013502 -17.781491 1.682288 -11.990126 -1.182592 -2.250385 -18.020168 1.441961
3192 50 4 8 56 45 Wyoming Weston County 7208 7208 7181 ... -11.752361 -8.040059 12.372583 1.533635 6.935294 -12.032179 -8.040059 12.372583 1.533635 6.935294

3142 rows × 100 columns

In [39]:
"""
%%timeit -n 10
for state in df["STNAME"].unique():
    avg = np.average(df.where(df["STNAME"] == state).dropna()["CENSUS2010POP"])
    print("Counties in state" + state + "have an average population of" + str(avg))
"""
Out[39]:
'\n%%timeit -n 10\nfor state in df["STNAME"].unique():\n    avg = np.average(df.where(df["STNAME"] == state).dropna()["CENSUS2010POP"])\n    print("Counties in state" + state + "have an average population of" + str(avg))\n'
In [40]:
"""
%%timeit -n 10
for group, frame in df.groupby("STNAME"):
    avg = np.average(frame["CENSUS2010POP"])
    print("Counties in state" + group + "have an average population of" + str(avg))
    """
Out[40]:
'\n%%timeit -n 10\nfor group, frame in df.groupby("STNAME"):\n    avg = np.average(frame["CENSUS2010POP"])\n    print("Counties in state" + group + "have an average population of" + str(avg))\n    '
In [41]:
df.head()
Out[41]:
SUMLEV REGION DIVISION STATE COUNTY STNAME CTYNAME CENSUS2010POP ESTIMATESBASE2010 POPESTIMATE2010 ... RDOMESTICMIG2011 RDOMESTICMIG2012 RDOMESTICMIG2013 RDOMESTICMIG2014 RDOMESTICMIG2015 RNETMIG2011 RNETMIG2012 RNETMIG2013 RNETMIG2014 RNETMIG2015
1 50 3 6 1 1 Alabama Autauga County 54571 54571 54660 ... 7.242091 -2.915927 -3.012349 2.265971 -2.530799 7.606016 -2.626146 -2.722002 2.592270 -2.187333
2 50 3 6 1 3 Alabama Baldwin County 182265 182265 183193 ... 14.832960 17.647293 21.845705 19.243287 17.197872 15.844176 18.559627 22.727626 20.317142 18.293499
3 50 3 6 1 5 Alabama Barbour County 27457 27457 27341 ... -4.728132 -2.500690 -7.056824 -3.904217 -10.543299 -4.874741 -2.758113 -7.167664 -3.978583 -10.543299
4 50 3 6 1 7 Alabama Bibb County 22915 22919 22861 ... -5.527043 -5.068871 -6.201001 -0.177537 0.177258 -5.088389 -4.363636 -5.403729 0.754533 1.107861
5 50 3 6 1 9 Alabama Blount County 57322 57322 57373 ... 1.807375 -1.177622 -1.748766 -2.062535 -1.369970 1.859511 -0.848580 -1.402476 -1.577232 -0.884411

5 rows × 100 columns

In [42]:
#将 STNAME 设为 index
df = df.set_index("STNAME")
#定义一个函数,首字母 <M 返回 0,首字母 >Q 返回 1,其他返回2
def fun(item):
    if item[0] < "M":
        return 0
    if item[0] > "Q":
        return 1
    return 2
#groupby(by)参数by用于决定分组,如果by是一个函数,那么就会对每个对象的 index进行分组
#将 index 即 STNAME 按照首字母进行分类,分为三类
for group, frame in df.groupby(fun):
    print("There are " + str(len(frame)) + " records in group " + str(group) + " for processing.")
There are 1177 records in group 0 for processing.
There are 831 records in group 1 for processing.
There are 1134 records in group 2 for processing.
In [43]:
df = pd.read_csv("census.csv")
df = df[df["SUMLEV"] == 50]
In [44]:
#groupby(by)如果by是一个lable(行名/列名),那么久就会对columns in self进行分组
#agg(func),func用于聚合数据,func可以为函数、str、list、dict
#agg()传入dict,包含感兴趣的column name,即CENSUS2010POP,以及想要apply的function,即np.average
#按照“STNAME”进行分组,对组内聚合求CENSUS2010POP平均值
df.groupby("STNAME").agg({"CENSUS2010POP" : np.average})
Out[44]:
CENSUS2010POP
STNAME
Alabama 71339.343284
Alaska 24490.724138
Arizona 426134.466667
Arkansas 38878.906667
California 642309.586207
Colorado 78581.187500
Connecticut 446762.125000
Delaware 299311.333333
District of Columbia 601723.000000
Florida 280616.567164
Georgia 60928.635220
Hawaii 272060.200000
Idaho 35626.863636
Illinois 125790.509804
Indiana 70476.108696
Iowa 30771.262626
Kansas 27172.552381
Kentucky 36161.391667
Louisiana 70833.937500
Maine 83022.562500
Maryland 240564.666667
Massachusetts 467687.785714
Michigan 119080.000000
Minnesota 60964.655172
Mississippi 36186.548780
Missouri 52077.626087
Montana 17668.125000
Nebraska 19638.075269
Nevada 158855.941176
New Hampshire 131647.000000
New Jersey 418661.619048
New Mexico 62399.363636
New York 312550.032258
North Carolina 95354.830000
North Dakota 12690.396226
Ohio 131096.636364
Oklahoma 48718.844156
Oregon 106418.722222
Pennsylvania 189587.746269
Rhode Island 210513.400000
South Carolina 100551.391304
South Dakota 12336.060606
Tennessee 66801.105263
Texas 98998.271654
Utah 95306.379310
Vermont 44695.785714
Virginia 60111.293233
Washington 172424.102564
West Virginia 33690.800000
Wisconsin 78985.916667
Wyoming 24505.478261
In [45]:
#dataframe groupby
print(type(df.groupby(level=0)["POPESTIMATE2010","POPESTIMATE2011"]))
#Series groupby
print(type(df.groupby(level=0)["POPESTIMATE2010"]))
<class 'pandas.core.groupby.generic.DataFrameGroupBy'>
<class 'pandas.core.groupby.generic.SeriesGroupBy'>
<ipython-input-45-0aed87c54ccf>:2: FutureWarning: Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.
  print(type(df.groupby(level=0)["POPESTIMATE2010","POPESTIMATE2011"]))
In [57]:
#dataframe和series的aggregate是有一些差异的
#首先,将df数据转为Series,index为STNAME,column为CENSUS2010POP
#接着,使用level参数将其按照index分组
#然后调用agg方法,其中dict中既有 np.average 函数又有 np.sum 函数
#Pandas对Series对象应用这些函数,由于只有一个column,而对其应用两个函数,所以显示sum、avg
#groupby(level=None),层次参数level可以是int/level name/sequence of such/None
#当分组对象是multiIndex的时候(index是一个多维的),level=0就是按照第一分层进行分组
(df.set_index('STNAME').groupby(level = 0)['CENSUS2010POP'].agg([('avg', np.average), ('sum', np.sum)]))
Out[57]:
avg sum
STNAME
Alabama 71339.343284 4779736
Alaska 24490.724138 710231
Arizona 426134.466667 6392017
Arkansas 38878.906667 2915918
California 642309.586207 37253956
Colorado 78581.187500 5029196
Connecticut 446762.125000 3574097
Delaware 299311.333333 897934
District of Columbia 601723.000000 601723
Florida 280616.567164 18801310
Georgia 60928.635220 9687653
Hawaii 272060.200000 1360301
Idaho 35626.863636 1567582
Illinois 125790.509804 12830632
Indiana 70476.108696 6483802
Iowa 30771.262626 3046355
Kansas 27172.552381 2853118
Kentucky 36161.391667 4339367
Louisiana 70833.937500 4533372
Maine 83022.562500 1328361
Maryland 240564.666667 5773552
Massachusetts 467687.785714 6547629
Michigan 119080.000000 9883640
Minnesota 60964.655172 5303925
Mississippi 36186.548780 2967297
Missouri 52077.626087 5988927
Montana 17668.125000 989415
Nebraska 19638.075269 1826341
Nevada 158855.941176 2700551
New Hampshire 131647.000000 1316470
New Jersey 418661.619048 8791894
New Mexico 62399.363636 2059179
New York 312550.032258 19378102
North Carolina 95354.830000 9535483
North Dakota 12690.396226 672591
Ohio 131096.636364 11536504
Oklahoma 48718.844156 3751351
Oregon 106418.722222 3831074
Pennsylvania 189587.746269 12702379
Rhode Island 210513.400000 1052567
South Carolina 100551.391304 4625364
South Dakota 12336.060606 814180
Tennessee 66801.105263 6346105
Texas 98998.271654 25145561
Utah 95306.379310 2763885
Vermont 44695.785714 625741
Virginia 60111.293233 7994802
Washington 172424.102564 6724540
West Virginia 33690.800000 1852994
Wisconsin 78985.916667 5686986
Wyoming 24505.478261 563626

Test

Looking at our backpacking equipment DataFrame, suppose we are interested in finding our total weight for each category. Use groupby to group the dataframe, and apply a function to calculate the total weight (Weight x Quantity) by category

print(df.groupby("Category").apply(lambda df, a, b : sum(df[a] * df[b]),"Weight(oz.)", "Quantity"))

In [62]:
(df.set_index("STNAME").groupby(level = 0)["POPESTIMATE2010", "POPESTIMATE2011"]
.agg([("avg" , np.average), ("sum" , np.sum)]))
<ipython-input-62-a0ebb5d92584>:1: FutureWarning: Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.
  (df.set_index("STNAME").groupby(level = 0)["POPESTIMATE2010", "POPESTIMATE2011"]
Out[62]:
POPESTIMATE2010 POPESTIMATE2011
avg sum avg sum
STNAME
Alabama 71420.313433 4785161 71658.328358 4801108
Alaska 24621.413793 714021 24921.379310 722720
Arizona 427213.866667 6408208 431248.800000 6468732
Arkansas 38965.253333 2922394 39180.506667 2938538
California 643691.017241 37334079 650000.586207 37700034
Colorado 78878.968750 5048254 79991.875000 5119480
Connecticut 447464.625000 3579717 448719.875000 3589759
Delaware 299930.333333 899791 302638.666667 907916
District of Columbia 605126.000000 605126 620472.000000 620472
Florida 281341.641791 18849890 285157.208955 19105533
Georgia 61090.905660 9713454 61712.452830 9812280
Hawaii 272796.000000 1363980 275645.400000 1378227
Idaho 35704.227273 1570986 36003.045455 1584134
Illinois 125894.598039 12841249 126096.882353 12861882
Indiana 70549.891304 6490590 70835.271739 6516845
Iowa 30815.090909 3050694 30963.525253 3065389
Kansas 27226.895238 2858824 27332.542857 2869917
Kentucky 36232.808333 4347937 36399.016667 4367882
Louisiana 71014.859375 4544951 71490.328125 4575381
Maine 82980.937500 1327695 83016.062500 1328257
Maryland 241183.708333 5788409 243507.125000 5844171
Massachusetts 468931.142857 6565036 472271.214286 6611797
Michigan 119004.445783 9877369 118995.048193 9876589
Minnesota 61044.862069 5310903 61472.632184 5348119
Mississippi 36223.365854 2970316 36317.060976 2977999
Missouri 52139.582609 5996052 52265.973913 6010587
Montana 17690.053571 990643 17816.892857 997746
Nebraska 19677.688172 1830025 19810.569892 1842383
Nevada 159025.882353 2703440 159930.529412 2718819
New Hampshire 131670.800000 1316708 131834.400000 1318344
New Jersey 419232.428571 8803881 421092.095238 8842934
New Mexico 62567.909091 2064741 62976.545455 2078226
New York 312950.322581 19402920 314890.354839 19523202
North Carolina 95589.790000 9558979 96510.250000 9651025
North Dakota 12726.981132 674530 12930.679245 685326
Ohio 131145.068182 11540766 131198.204545 11545442
Oklahoma 48825.922078 3759596 49176.961039 3786626
Oregon 106610.333333 3837972 107458.583333 3868509
Pennsylvania 189731.552239 12712014 190226.895522 12745202
Rhode Island 210643.800000 1053219 210371.200000 1051856
South Carolina 100780.304348 4635894 101581.152174 4672733
South Dakota 12368.166667 816299 12489.227273 824289
Tennessee 66911.421053 6356585 67351.663158 6398408
Texas 99387.255906 25244363 101001.826772 25654464
Utah 95704.344828 2775426 97118.620690 2816440
Vermont 44713.142857 625984 44763.357143 626687
Virginia 60344.263158 8025787 60983.330827 8110783
Washington 172898.974359 6743060 174954.589744 6823229
West Virginia 33713.181818 1854225 33726.327273 1854948
Wisconsin 79030.611111 5690204 79301.666667 5709720
Wyoming 24544.173913 564516 24685.565217 567768
In [68]:
#如果将函数label改为column的名称,那么就只是对2010求平均,对2011求和,与上面的分别求和、求平均是不一样的
#这时,pandas自动识别column与function label是一样的,不会新建column而是直接对其应用函数
#在使用agg时候需要注意这一点
(df.set_index("STNAME").groupby(level = 0)["POPESTIMATE2010", "POPESTIMATE2011"]
.agg({"POPESTIMATE2010":np.average, "POPESTIMATE2011":np.sum}))
<ipython-input-68-752bd9c859f7>:3: FutureWarning: Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.
  (df.set_index("STNAME").groupby(level = 0)["POPESTIMATE2010", "POPESTIMATE2011"]
Out[68]:
POPESTIMATE2010 POPESTIMATE2011
STNAME
Alabama 71420.313433 4801108
Alaska 24621.413793 722720
Arizona 427213.866667 6468732
Arkansas 38965.253333 2938538
California 643691.017241 37700034
Colorado 78878.968750 5119480
Connecticut 447464.625000 3589759
Delaware 299930.333333 907916
District of Columbia 605126.000000 620472
Florida 281341.641791 19105533
Georgia 61090.905660 9812280
Hawaii 272796.000000 1378227
Idaho 35704.227273 1584134
Illinois 125894.598039 12861882
Indiana 70549.891304 6516845
Iowa 30815.090909 3065389
Kansas 27226.895238 2869917
Kentucky 36232.808333 4367882
Louisiana 71014.859375 4575381
Maine 82980.937500 1328257
Maryland 241183.708333 5844171
Massachusetts 468931.142857 6611797
Michigan 119004.445783 9876589
Minnesota 61044.862069 5348119
Mississippi 36223.365854 2977999
Missouri 52139.582609 6010587
Montana 17690.053571 997746
Nebraska 19677.688172 1842383
Nevada 159025.882353 2718819
New Hampshire 131670.800000 1318344
New Jersey 419232.428571 8842934
New Mexico 62567.909091 2078226
New York 312950.322581 19523202
North Carolina 95589.790000 9651025
North Dakota 12726.981132 685326
Ohio 131145.068182 11545442
Oklahoma 48825.922078 3786626
Oregon 106610.333333 3868509
Pennsylvania 189731.552239 12745202
Rhode Island 210643.800000 1051856
South Carolina 100780.304348 4672733
South Dakota 12368.166667 824289
Tennessee 66911.421053 6398408
Texas 99387.255906 25654464
Utah 95704.344828 2816440
Vermont 44713.142857 626687
Virginia 60344.263158 8110783
Washington 172898.974359 6823229
West Virginia 33713.181818 1854948
Wisconsin 79030.611111 5709720
Wyoming 24544.173913 567768
In [ ]: