I have the following data:
import pandas as pd
import numpy as np
from numpy import rec, nan
a=rec.array([(202104L, 'tCompany_A', 'b2b', 'fast', 741246713L, 3.0, 50.75),
(202108L, 'tCompany_A', 'b2c', 'fast', 741246713L, 5.0, 91.81),
(202109L, 'tCompany_A', 'b2b', 'fast', 741246713L, 6.0, 85.8),
(202109L, 'tCompany_A', 'b2c', 'fast', 741246713L, 0.19995117199999998, 53.65),
(202112L, 'tCompany_A', 'b2c', 'fast', 741246713L, 0.09997558599999999, 56.5),
(202110L, 'Company_B', 'b2c', 'fast', 790395875L, 11.0, 75.03),
(202201L, 'Company_B', 'b2b', 'fast', 790395875L, 13.0, 57.33),
(202202L, 'Company_B', 'b2b', 'fast', 790395875L, 0.09997558599999999, 69.52),
(202203L, 'Company_B', 'b2b', 'fast', 790395875L, 0.09997558599999999, 49.47),
(201810L, 'Company_C', 'b2c', 'fast', 252364056L, 3.0, 42.78),
(202002L, 'Company_C', 'b2c', 'fast', 252364056L, 0.09997558599999999, 39.53),
(202005L, 'Company_D', 'b2b', 'fast', 718447908L, 5.0, 30.55),
(202006L, 'Company_D', 'b2b', 'fast', 718447908L, 6.0, 30.55),
(202007L, 'Company_D', 'b2b', 'fast', 718447908L, 7.0, 30.54),
(202008L, 'Company_D', 'b2b', 'fast', 718447908L, 11.0, 30.55),
(202010L, 'Company_D', 'b2b', 'fast', 718447908L, 0.09997558599999999, 34.8),
(202011L, 'Company_D', 'b2b', 'fast', 718447908L, 12.0, 30.55),
(202012L, 'Company_D', 'b2b', 'fast', 718447908L, 12.0, 30.55),
(202101L, 'Company_D', 'b2b', 'fast', 718447908L, 12.0, 32.25),
(202103L, 'Company_D', 'b2b', 'fast', 718447908L, 12.0, 32.25),
(202104L, 'Company_D', 'b2b', 'fast', 718447908L, 12.0, 32.25),
(202105L, 'Company_D', 'b2b', 'fast', 718447908L, 4.0, 57.3),
(202106L, 'Company_D', 'b2b', 'fast', 718447908L, 0.09997558599999999, 70.78),
(202110L, 'Company_E', 'b2c', 'fast', 392704663L, 3.0, 42.48),
(201810L, 'Company_F', 'b2c', 'fast', 490721459L, 4.0, 32.74),
(201812L, 'Company_F', 'b2c', 'fast', 490721459L, 0.09997558599999999, 83.3),
(201812L, 'Company_G', 'b2b', 'fast', 103042224L, 0.09997558599999999, 39.37),
(201908L, 'Company_H', 'b2b', 'fast', 696853886L, 3.0, 77.34),
(201908L, 'Company_H', 'b2c', 'fast', 696853886L, 0.19995117199999998, 49.39),
(202111L, 'Company_H', 'b2b', 'fast', 696853886L, 2.0, 51.95),
(202202L, 'Company_H', 'b2c', 'fast', 696853886L, 0.09997558599999999, 49.5),
(201902L, 'Company_I', 'b2b', 'fast', 490964121L, 0.5, 53.15),
(201903L, 'Company_I', 'b2b', 'fast', 490964121L, 1.0, 26.96),
(201904L, 'Company_I', 'b2b', 'fast', 490964121L, 2.0, 56.64),
(201905L, 'Company_I', 'b2b', 'fast', 490964121L, 0.19995117199999998, 64.62),
(201906L, 'Company_I', 'b2b', 'fast', 490964121L, 0.600097656, 61.45),
(201911L, 'Company_I', 'b2b', 'fast', 490964121L, 3.0, 110.65),
(201912L, 'Company_I', 'b2b', 'fast', 490964121L, 4.0, 57.09),
(202002L, 'Company_I', 'b2c', 'fast', 490964121L, 0.09997558599999999, 61.97),
(202003L, 'Company_I', 'b2b', 'fast', 490964121L, 7.0, 42.07),
(202003L, 'Company_I', 'b2c', 'fast', 490964121L, 8.0, 28.91),
(202011L, 'Company_I', 'b2b', 'fast', 490964121L, 9.0, 44.8),
(202011L, 'Company_I', 'b2c', 'fast', 490964121L, 11.0, 45.83),
(202104L, 'Company_I', 'b2c', 'fast', 490964121L, 0.09997558599999999, 34.8),
(202106L, 'Company_I', 'b2b', 'fast', 490964121L, 0.09997558599999999, 40.52),
(201908L, 'Company_J', 'b2b', 'fast', 47700086L, 0.09997558599999999, 32.66),
(201909L, 'Company_J', 'b2b', 'fast', 47700086L, 0.09997558599999999, 32.66),
(201910L, 'Company_J', 'b2b', 'fast', 47700086L, 0.09997558599999999, 32.66),
(201911L, 'Company_J', 'b2b', 'fast', 47700086L, 0.19995117199999998, 32.69),
(201912L, 'Company_J', 'b2b', 'fast', 47700086L, 0.09997558599999999, 32.66),
(202001L, 'Company_J', 'b2b', 'fast', 47700086L, 0.09997558599999999, 33.0),
(202002L, 'Company_J', 'b2b', 'fast', 47700086L, 0.300048828, 34.63),
(202003L, 'Company_J', 'b2b', 'fast', 47700086L, 0.09997558599999999, 34.37),
(202004L, 'Company_J', 'b2b', 'fast', 47700086L, 0.09997558599999999, 34.37),
(202005L, 'Company_J', 'b2b', 'fast', 47700086L, 0.09997558599999999, 31.62),
(202006L, 'Company_J', 'b2b', 'fast', 47700086L, 0.09997558599999999, 34.88),
(202007L, 'Company_J', 'b2b', 'fast', 47700086L, 23.0, 32.81),
(202008L, 'Company_J', 'b2b', 'fast', 47700086L, 0.09997558599999999, 34.37),
(202009L, 'Company_J', 'b2c', 'fast', 47700086L, 33.0, 37.4),
(202011L, 'Company_J', 'b2b', 'fast', 47700086L, 0.09997558599999999, 36.37),
(202012L, 'Company_J', 'b2b', 'fast', 47700086L, 0.19995117199999998, 34.68),
(202101L, 'Company_J', 'b2c', 'fast', 47700086L, 32.0, 36.21),
(202102L, 'Company_J', 'b2b', 'fast', 47700086L, 0.09997558599999999, 42.28),
(202103L, 'Company_J', 'b2b', 'fast', 47700086L, 0.300048828, 36.35),
(202104L, 'Company_J', 'b2b', 'fast', 47700086L, 0.09997558599999999, 36.35),
(202105L, 'Company_J', 'b2b', 'fast', 47700086L, 0.09997558599999999, 36.35),
(202106L, 'Company_J', 'b2b', 'fast', 47700086L, 0.09997558599999999, 36.35),
(202107L, 'Company_J', 'b2b', 'fast', 47700086L, 23.0, 34.7),
(202108L, 'Company_J', 'b2b', 'fast', 47700086L, 0.09997558599999999, 36.35),
(202109L, 'Company_J', 'b2b', 'fast', 47700086L, 0.09997558599999999, 36.35),
(202110L, 'Company_J', 'b2b', 'fast', 47700086L, 0.19995117199999998, 35.8),
(202111L, 'Company_J', 'b2b', 'fast', 47700086L, 0.09997558599999999, 34.93),
(202112L, 'Company_J', 'b2b', 'fast', 47700086L, 0.09997558599999999, 36.35),
(202201L, 'Company_J', 'b2b', 'fast', 47700086L, 0.09997558599999999, 43.95),
(202202L, 'Company_J', 'b2b', 'fast', 47700086L, 0.09997558599999999, 41.03),
(202203L, 'Company_J', 'b2b', 'fast', 47700086L, 0.39990234399999997, 42.19),
(202203L, 'Company_J', 'b2c', 'fast', 47700086L, 89.0, 44.83),
(201901L, 'Company_K', 'b2c', 'fast', 912093182L, 98.0, 38.06),
(201906L, 'Company_K', 'b2c', 'fast', 912093182L, 11.0, 72.42),
(201909L, 'Company_K', 'b2b', 'fast', 912093182L, 2.0, 31.83),
(202001L, 'Company_K', 'b2b', 'fast', 912093182L, 0.09997558599999999, 40.9),
(202005L, 'Company_K', 'b2c', 'fast', 912093182L, 0.09997558599999999, 68.41),
(202011L, 'Company_K', 'b2b', 'fast', 912093182L, 6.0, 36.01),
(202011L, 'Company_K', 'b2c', 'fast', 912093182L, 4.0, 95.27),
(202110L, 'Company_K', 'b2b', 'fast', 912093182L, 5.0, 65.73),
(202111L, 'Company_K', 'b2c', 'fast', 912093182L, 2.0, 54.41),
(202203L, 'Company_K', 'b2c', 'fast', 912093182L, 41.0, 52.99),
(202004L, 'Company_L', 'b2b', 'fast', 6533556L, 0.0, 42.7),
(202012L, 'Company_L', 'b2b', 'fast', 6533556L, 5.0, 54.0),
(201811L, 'Company_M', 'b2c', 'fast', 514344067L, 8.0, 47.54),
(201812L, 'Company_M', 'b2c', 'fast', 514344067L, 0.09997558599999999, 25.5),
(202001L, 'Company_M', 'b2b', 'fast', 514344067L, 3.0, 21.71),
(202006L, 'Company_M', 'b2c', 'fast', 514344067L, 2.0, 26.3),
(201811L, 'Company_N', 'b2c', 'fast', 6353793L, 5.0, 30.9),
(201903L, 'Company_N', 'b2c', 'fast', 6353793L, 6.0, 32.91),
(201904L, 'Company_N', 'b2c', 'fast', 6353793L, 8.0, 33.75),
(201905L, 'Company_N', 'b2b', 'fast', 6353793L, 7.0, 29.05),
(201906L, 'Company_N', 'b2b', 'fast', 6353793L, 8.0, 30.65),
(201906L, 'Company_N', 'b2c', 'fast', 6353793L, 0.0, 33.75),
(201907L, 'Company_N', 'b2c', 'fast', 6353793L, 8.0, 33.74)],
dtype=[('yyyymm', '<i8'), ('bus_nm', 'O'), ('b2b_b2c', 'O'), ('prod', 'O'), ('id', '<i8'), ('adv', '<f8'), ('pkg_yld', '<f8')])
df_final=pd.DataFrame.from_records(a)
df_final['b2b_b2c_prod'] = df_final['b2b_b2c'] + '_' + df_final['prod']
df_final
Here is a snapshot of the frame called ‘b2b_b2c_prod‘:
Next I create 2 dataframes df2_b2c_fast, df2_b2b_fast from the original b2b_b2c_prod:
for x in df_final['b2b_b2c_prod'].unique():
locals()['df2_' + x ] = df_final[(df_final['b2b_b2c_prod'] == x ) ]
and calculate the correlation between pkg_yld and ADV by business name, 1 of 2 arrays shown:
df_corrs_fast_b2c=df2_b2c_fast.groupby(['bus_nm','id']).corr(method='spearman').unstack().iloc[:,1]
df_corrs_fast_b2b=df2_b2b_fast.groupby(['bus_nm','id']).corr(method='spearman').unstack().iloc[:,1]
df_corrs_fast_b2c
bus_nm id
Company_A 741246713 0.500000
Company_B 790395875 NaN
Company_C 252364056 1.000000
Company_E 392704663 NaN
Company_F 490721459 -1.000000
Company_H 696853886 -1.000000
Company_I 490964121 -0.210819
Company_J 47700086 1.000000
Company_K 912093182 -0.542857
Company_M 514344067 1.000000
Company_N 6353793 0.131579
Name: (adv, pkg_yld), dtype: float64
Then convert each to dataframes:
corrs_b2b_fast = (df2_b2b_fast[['adv', 'id']] .groupby('id') .corrwith(df1['pkg_yld']) .rename(columns={'adv' : 'correl'}) .reset_index()) corrs_b2c_fast = (df2_b2c_fast[['adv', 'id']] .groupby('id') .corrwith(df1['pkg_yld']) .rename(columns={'adv' : 'correl'}) .reset_index())
Here is one of them:
Question: Can I generate the same results within a for loop to save steps?
I tried the following below but won’t work.
Here is the partial code:
mylist=[df2_b2b_fast, df2_b2c_fast] # Subset of dfs
for x in df_list['b2b_b2c_prod']:
locals()['corrs_' + x ] = df_list[(df_list['b2b_b2c_prod'] == x ) ] # Create new 2 dfs from main df 'b2b_b2c_prod'
x= x.groupby(['bus_nm','id']).corr(method='spearman').unstack().iloc[:,1] # calculate corr between pkg_yld and ADV
# stuck here...lines needed to create dataframes corrs_b2b_fast, corrs_b2c_fast??
What’s wrong/missing?