import pandas as pd
import numpy as np
import matplotlib as mpl
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style("darkgrid")
mpl.rcParams['figure.figsize'] = (20,5)
import statsmodels.api as sm # For Regression Modeling
import datetime # For converting and extracting datetime data
df = pd.read_csv('c:/load/ecommerce.csv',parse_dates=["event_time"])
df['event_time'].dtypes
datetime64[ns, UTC]
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20692840 entries, 0 to 20692839
Data columns (total 10 columns):
 #   Column         Dtype
---  ------         -----
 0   Unnamed: 0     int64
 1   event_time     datetime64[ns, UTC]
 2   event_type     object
 3   product_id     int64
 4   category_id    int64
 5   category_code  object
 6   brand          object
 7   price          float64
 8   user_id        int64
 9   user_session   object
dtypes: datetime64[ns, UTC](1), float64(1), int64(4), object(4)
memory usage: 1.5+ GB
df.drop(columns=['Unnamed: 0'],inplace=True)
df.columns
Index(['event_time', 'event_type', 'product_id', 'category_id', 'category_code', 'brand', 'price', 'user_id', 'user_session'], dtype='object')
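Incidentally, the dropped 'Unnamed: 0' column is almost certainly a row index written out by an earlier to_csv. A hedged alternative, assuming the same file layout, is to absorb it at read time so the column is never created:
# Sketch: treat the first CSV column as the index so 'Unnamed: 0' never appears
df = pd.read_csv('c:/load/ecommerce.csv', parse_dates=['event_time'], index_col=0)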
df.isnull().sum()
event_time                0
event_type                0
product_id                0
category_id               0
category_code      20339246
brand               8757117
price                     0
user_id                   0
user_session           4598
dtype: int64
df.isnull().sum()/df.shape[0] * 100
event_time        0.000000
event_type        0.000000
product_id        0.000000
category_id       0.000000
category_code    98.291225
brand            42.319551
price             0.000000
user_id           0.000000
user_session      0.022220
dtype: float64
if 'category_code' in df:
df.drop(columns=['category_code'],inplace=True)
df.columns
Index(['event_time', 'event_type', 'product_id', 'category_id', 'brand', 'price', 'user_id', 'user_session'], dtype='object')
if 'brand' in df:
df.drop(columns=['brand'],inplace=True)
df.columns
Index(['event_time', 'event_type', 'product_id', 'category_id', 'price', 'user_id', 'user_session'], dtype='object')
df.drop_duplicates(inplace=True)
print('There are now ' + str(df.shape[0]) + ' rows.')
There are now 19583742 rows.
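For the record, the number of removed rows can also be captured explicitly; a minimal sketch (it would need to run before the drop_duplicates call above):
# Count fully duplicated rows that drop_duplicates would remove
print('Duplicate rows: ' + str(df.duplicated().sum()))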
df['event_type'] = df['event_type'].str.strip()
df['user_session'] = df['user_session'].str.strip()
plt.hist(df['price'], bins=50, density=True, facecolor='g', alpha=0.75)
plt.title('Price Histogram', fontsize = 18)
plt.xlabel('Euro Bins', fontsize = 16)
plt.ylabel('Density', fontsize = 16)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.show()
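Prices are heavily right-skewed, so the tall cheap bins crush the rest of the histogram; a log-count variant of the same plot makes the expensive tail visible. A sketch:
# Same histogram with log-scaled counts to expose the sparse, expensive tail
plt.hist(df['price'], bins=50, facecolor='g', alpha=0.75, log=True)
plt.title('Price Histogram (log counts)', fontsize = 18)
plt.xlabel('Euro Bins', fontsize = 16)
plt.ylabel('Log Frequency', fontsize = 16)
plt.show()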
df.event_type.value_counts().loc[['view', 'cart', 'remove_from_cart', 'purchase']].plot.bar(
    xlabel = "Events",
    ylabel = "Totals, 1.0 = 10 Million Events",
    title = "Bar Chart of Event Totals",
    rot = 45,
    fontsize = 16);
round(df['event_type'].value_counts(normalize = True),2)
view                0.49
cart                0.29
remove_from_cart    0.15
purchase            0.07
Name: event_type, dtype: float64
df['event_type'].value_counts().sort_values(ascending=False)
view                9656790
cart                5652991
remove_from_cart    2987859
purchase            1286102
Name: event_type, dtype: int64
df.price.max()
print('The highest price in the dataset is: ' + str(df.price.max()) + ' Euros')
The highest price in the dataset is: 327.78 Euros
df.price.min()
print('The lowest price in the dataset is: ' + str(df.price.min()) + ' Euros')
The lowest price in the dataset is: -79.37 Euros
df.sort_values(by=['price']).head()
| | event_time | event_type | product_id | category_id | price | user_id | user_session |
|---|---|---|---|---|---|---|---|
13351304 | 2020-02-07 21:52:23+00:00 | purchase | 5716861 | 1487580014042939619 | -79.37 | 602831022 | c07648b8-c372-4936-b5ae-78b0f4236af9 |
4283228 | 2019-11-06 17:31:51+00:00 | purchase | 5716861 | 1487580014042939619 | -79.37 | 554241372 | b85719db-2c56-460b-b800-d7a47d4c4d70 |
19850269 | 2020-01-26 18:55:52+00:00 | purchase | 5716861 | 1487580014042939619 | -79.37 | 606448681 | 466cd335-b273-49a4-aefb-3077b5fcd87e |
17952565 | 2020-01-13 13:03:15+00:00 | purchase | 5716861 | 1487580014042939619 | -79.37 | 489184067 | 36f0ade4-fe9f-4353-9ddb-a6600e11b314 |
12938589 | 2020-02-05 07:57:45+00:00 | purchase | 5716861 | 1487580014042939619 | -79.37 | 562535575 | d243bb1b-2ee9-4c21-9b2a-6460469e9a75 |
df = df[~(df['price'] <= 0)] # Removing non-positive price value rows
print('There are now ' + str(df.shape[0]) + ' rows and ' + str(df.shape[1]) + ' columns.')
There are now 19497292 rows and 7 columns.
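A quick invariant check, assuming it runs right after the filter, confirms that no non-positive prices survive:
# Every remaining price should be strictly positive
assert (df['price'] > 0).all()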
df.sort_values(by=['price'],ascending = True).head()
| | event_time | event_type | product_id | category_id | price | user_id | user_session |
|---|---|---|---|---|---|---|---|
8780080 | 2019-10-05 09:58:08+00:00 | cart | 5603938 | 1487580007717929935 | 0.05 | 556992916 | 2ff16c5f-d79e-4a44-ac2b-38ede7547a47 |
16054042 | 2020-02-27 10:12:14+00:00 | purchase | 5603934 | 1487580007717929935 | 0.05 | 565745379 | 5019ff7a-ca64-4098-ae8d-383c76d27500 |
8003423 | 2019-11-29 22:50:37+00:00 | view | 5603938 | 1487580007717929935 | 0.05 | 554216262 | 213f3248-fe64-4cda-a41b-1ac4f984610c |
993262 | 2019-12-08 19:38:03+00:00 | remove_from_cart | 5603937 | 1487580007717929935 | 0.05 | 518113830 | 313f0460-d950-4e63-93e3-852014a95338 |
16054043 | 2020-02-27 10:12:14+00:00 | purchase | 5603937 | 1487580007717929935 | 0.05 | 565745379 | 5019ff7a-ca64-4098-ae8d-383c76d27500 |
print('The average price of a product is '+ str(int(df['price'].mean() ))+ ' Euros.')
The average price of a product is 8 Euros.
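Because the distribution is right-skewed, the median may describe a typical product better than the truncated mean above; a quick check:
# The median is robust to the expensive tail seen in the histogram
print('The median price of a product is ' + str(df['price'].median()) + ' Euros.')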
# Count distinct values in each remaining column
for col in ['event_type', 'product_id', 'category_id', 'price', 'user_id', 'user_session']:
    print('There are ' + str(len(df[col].unique())) + ' unique values in the ' + col + ' column')
There are 4 unique values in the event_type column
There are 53904 unique values in the product_id column
There are 523 unique values in the category_id column
There are 2854 unique values in the price column
There are 1637977 unique values in the user_id column
There are 4520285 unique values in the user_session column
for event_type in df['event_type'].unique():
    print('Unique Event type: ' + event_type)
Unique Event type: remove_from_cart
Unique Event type: view
Unique Event type: cart
Unique Event type: purchase
print('There are ' + str(len(df['user_id'].unique())) + ' unique Users.')
There are 1637977 unique Users.
df['purchased_yn'] = np.where(df['event_type']=='purchase', 1, 0)
df['purchased_yn'].value_counts()
0    18211310
1     1285982
Name: purchased_yn, dtype: int64
purchase_event_counts = df['purchased_yn'].value_counts()
print('There are ' + str(purchase_event_counts[1]) + ' purchase events and ' + str(purchase_event_counts[0]) + ' other events.')
purchase_event_percent = round(purchase_event_counts[1] / purchase_event_counts.sum() * 100)
print('That is ' + str(purchase_event_percent) + '% of all events.')
There are 1285982 purchase events and 18211310 other events.
That is 7% of all events.
# Total number of purchases
total_num_purchases = len(df[(df.purchased_yn == 1)]) # 1 = Purchased
print('Total number of purchases is ' + str(total_num_purchases))
# Total number of purchases per day
total_num_purchases_daily = round(total_num_purchases / 152) # the dataset spans ~152 days (about 5 months)
print('Total number of purchases per day is ' + str(total_num_purchases_daily))
# Total number of purchases per month
total_num_purchases_monthly = round((total_num_purchases / 5))
print('Total number of purchases per month is ' + str(total_num_purchases_monthly))
Total number of purchases is 1285982
Total number of purchases per day is 8460
Total number of purchases per month is 257196
# Number of purchases per User
df_g = df.query('purchased_yn == 1') # Query only the purchased (=1) items.
df_g = df_g.filter(['user_id','purchased_yn']) # Get only the needed columns.
# Number of purchases specific to the user_id
user_purchase_df = pd.DataFrame(df_g.groupby('user_id')['purchased_yn'].count()).reset_index()
# Merge the dataframes
user_purchase_df.columns = ['user_id','total_purchased']
df_joined = df.merge(user_purchase_df, how='left', on='user_id')
## Note primary dataframe name is now df_joined
# Calculating average number of purchases
total_gotten = df_joined['total_purchased'].sum()
count_purchased = len(df_joined) # total event (row) count, so this average is event-weighted
avg_purchased = round(total_gotten / count_purchased,2)
print('Average number of purchases per user is: ' + str(avg_purchased))
Average number of purchases per user is: 18.94
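As a cross-check, the same average can be taken directly over the per-user table, which weights each purchasing user once rather than once per event. A sketch, assuming user_purchase_df is still in scope:
# Mean purchases among users with at least one purchase (user-weighted, not event-weighted)
print('User-weighted average purchases: ' + str(round(user_purchase_df['total_purchased'].mean(), 2)))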
# Number of sessions per User
df_h = df_joined[df_joined['user_session'].notna()] # Keep only events that carry a session id
df_h = df_h.filter(['user_id','user_session']) # Get only the needed columns.
# The number of session-tagged events specific to (per) the user_id
user_session_df = pd.DataFrame(df_h.groupby('user_id')['user_session'].count()).reset_index()
# Merge the dataframes
user_session_df.columns = ['user_id','session_per_user']
df_joined = df_joined.merge(user_session_df, how='left', on='user_id')
df_joined
| | event_time | event_type | product_id | category_id | price | user_id | user_session | purchased_yn | total_purchased | session_per_user |
|---|---|---|---|---|---|---|---|---|---|---|
0 | 2019-12-01 00:00:00+00:00 | remove_from_cart | 5712790 | 1487580005268456287 | 6.27 | 576802932 | 51d85cb0-897f-48d2-918b-ad63965c12dc | 0 | 64.0 | 1266.0 |
1 | 2019-12-01 00:00:00+00:00 | view | 5764655 | 1487580005411062629 | 29.05 | 412120092 | 8adff31e-2051-4894-9758-224bfa8aec18 | 0 | 9.0 | 93.0 |
2 | 2019-12-01 00:00:02+00:00 | cart | 4958 | 1487580009471148064 | 1.19 | 494077766 | c99a50e8-2fac-4c4d-89ec-41c05f114554 | 0 | 50.0 | 127.0 |
3 | 2019-12-01 00:00:05+00:00 | view | 5848413 | 1487580007675986893 | 0.79 | 348405118 | 722ffea5-73c0-4924-8e8f-371ff8031af4 | 0 | NaN | 85.0 |
4 | 2019-12-01 00:00:07+00:00 | view | 5824148 | 1487580005511725929 | 5.56 | 576005683 | 28172809-7e4a-45ce-bab0-5efa90117cd5 | 0 | NaN | 57.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
19497287 | 2020-01-31 23:59:52+00:00 | view | 5796984 | 1487580005671109489 | 1.73 | 417102560 | dcf90ff3-a246-4b00-b39a-83e6444a0769 | 0 | NaN | 26.0 |
19497288 | 2020-01-31 23:59:52+00:00 | remove_from_cart | 5886774 | 1487580006317032337 | 1.59 | 607092857 | a4ccd1c4-a9d2-48d0-9816-082ec5bb5e47 | 0 | 11.0 | 57.0 |
19497289 | 2020-01-31 23:59:53+00:00 | view | 5875432 | 2084144451428549153 | 2.05 | 423651741 | fb42a963-abef-4c4f-b1ba-f5812dd54e80 | 0 | NaN | 85.0 |
19497290 | 2020-01-31 23:59:57+00:00 | remove_from_cart | 5820745 | 1487580006317032337 | 2.22 | 607092857 | a4ccd1c4-a9d2-48d0-9816-082ec5bb5e47 | 0 | 11.0 | 57.0 |
19497291 | 2020-01-31 23:59:58+00:00 | remove_from_cart | 5820745 | 1487580006317032337 | 2.22 | 607092857 | a4ccd1c4-a9d2-48d0-9816-082ec5bb5e47 | 0 | 11.0 | 57.0 |
19497292 rows × 10 columns
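Note that the groupby count above tallies session-tagged events, not distinct sessions: a user with 100 events in one session scores 100. If distinct sessions are the goal, nunique is the hedged alternative (sketch, assuming df_h is still in scope):
# Distinct sessions per user, counting each session id once
distinct_sessions_df = (df_h.groupby('user_id')['user_session']
                            .nunique()
                            .reset_index(name='distinct_sessions_per_user'))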
# Making a column based on which event the user reached
conditions = [
(df_joined['event_type'] =='view'), # = 1
(df_joined['event_type'] =='cart'), # = 2
(df_joined['event_type'] =='remove_from_cart'), # = 3
(df_joined['event_type'] =='purchase')] # = 4
choices = [1,2,3,4]
df_joined['event_type_level'] = np.select(conditions,choices,default = 1)
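A hypothetical equivalent uses a dict mapping instead of np.select, which keeps the level assignments in one place:
level_map = {'view': 1, 'cart': 2, 'remove_from_cart': 3, 'purchase': 4}
df_joined['event_type_level'] = df_joined['event_type'].map(level_map).fillna(1).astype(int) # unknown types default to 1, as in np.select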
# Adding event_level per user.
event_level_per_user_df = pd.DataFrame(df_joined.groupby('user_id')['event_type_level'].nunique()).reset_index()
event_level_per_user_df
# Merge it back into main dataframe
event_level_per_user_df.columns = ['user_id','event_type_stage']
df_joined = df_joined.merge(event_level_per_user_df, how='left', on='user_id')
df_joined
| | event_time | event_type | product_id | category_id | price | user_id | user_session | purchased_yn | total_purchased | session_per_user | event_type_level | event_type_stage |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2019-12-01 00:00:00+00:00 | remove_from_cart | 5712790 | 1487580005268456287 | 6.27 | 576802932 | 51d85cb0-897f-48d2-918b-ad63965c12dc | 0 | 64.0 | 1266.0 | 3 | 4 |
1 | 2019-12-01 00:00:00+00:00 | view | 5764655 | 1487580005411062629 | 29.05 | 412120092 | 8adff31e-2051-4894-9758-224bfa8aec18 | 0 | 9.0 | 93.0 | 1 | 4 |
2 | 2019-12-01 00:00:02+00:00 | cart | 4958 | 1487580009471148064 | 1.19 | 494077766 | c99a50e8-2fac-4c4d-89ec-41c05f114554 | 0 | 50.0 | 127.0 | 2 | 4 |
3 | 2019-12-01 00:00:05+00:00 | view | 5848413 | 1487580007675986893 | 0.79 | 348405118 | 722ffea5-73c0-4924-8e8f-371ff8031af4 | 0 | NaN | 85.0 | 1 | 3 |
4 | 2019-12-01 00:00:07+00:00 | view | 5824148 | 1487580005511725929 | 5.56 | 576005683 | 28172809-7e4a-45ce-bab0-5efa90117cd5 | 0 | NaN | 57.0 | 1 | 3 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
19497287 | 2020-01-31 23:59:52+00:00 | view | 5796984 | 1487580005671109489 | 1.73 | 417102560 | dcf90ff3-a246-4b00-b39a-83e6444a0769 | 0 | NaN | 26.0 | 1 | 2 |
19497288 | 2020-01-31 23:59:52+00:00 | remove_from_cart | 5886774 | 1487580006317032337 | 1.59 | 607092857 | a4ccd1c4-a9d2-48d0-9816-082ec5bb5e47 | 0 | 11.0 | 57.0 | 3 | 4 |
19497289 | 2020-01-31 23:59:53+00:00 | view | 5875432 | 2084144451428549153 | 2.05 | 423651741 | fb42a963-abef-4c4f-b1ba-f5812dd54e80 | 0 | NaN | 85.0 | 1 | 3 |
19497290 | 2020-01-31 23:59:57+00:00 | remove_from_cart | 5820745 | 1487580006317032337 | 2.22 | 607092857 | a4ccd1c4-a9d2-48d0-9816-082ec5bb5e47 | 0 | 11.0 | 57.0 | 3 | 4 |
19497291 | 2020-01-31 23:59:58+00:00 | remove_from_cart | 5820745 | 1487580006317032337 | 2.22 | 607092857 | a4ccd1c4-a9d2-48d0-9816-082ec5bb5e47 | 0 | 11.0 | 57.0 | 3 | 4 |
19497292 rows × 12 columns
# Rough per-user activity-time proxy: 3-event rolling mean of the minute-of-hour component
df_joined['aet_per_user'] = df_joined.event_time.dt.minute.rolling(window=3).mean()
df_aet = df_joined.filter(['aet_per_user','user_id']) # Get only the needed columns.
# Average event duration specific to (per) the user_id
avg_event_time_df = pd.DataFrame(df_aet.groupby('user_id')['aet_per_user'].mean().reset_index())
# Merge the dataframes
avg_event_time_df.columns = ['user_id','avg_evt_dur_per_user']
df_joined = df_joined.merge(avg_event_time_df, how='left', on='user_id')
df_joined
| | event_time | event_type | product_id | category_id | price | user_id | user_session | purchased_yn | total_purchased | session_per_user | event_type_level | event_type_stage | aet_per_user | avg_evt_dur_per_user |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2019-12-01 00:00:00+00:00 | remove_from_cart | 5712790 | 1487580005268456287 | 6.27 | 576802932 | 51d85cb0-897f-48d2-918b-ad63965c12dc | 0 | 64.0 | 1266.0 | 3 | 4 | NaN | 28.854893 |
1 | 2019-12-01 00:00:00+00:00 | view | 5764655 | 1487580005411062629 | 29.05 | 412120092 | 8adff31e-2051-4894-9758-224bfa8aec18 | 0 | 9.0 | 93.0 | 1 | 4 | NaN | 31.261649 |
2 | 2019-12-01 00:00:02+00:00 | cart | 4958 | 1487580009471148064 | 1.19 | 494077766 | c99a50e8-2fac-4c4d-89ec-41c05f114554 | 0 | 50.0 | 127.0 | 2 | 4 | 0.0 | 26.860892 |
3 | 2019-12-01 00:00:05+00:00 | view | 5848413 | 1487580007675986893 | 0.79 | 348405118 | 722ffea5-73c0-4924-8e8f-371ff8031af4 | 0 | NaN | 85.0 | 1 | 3 | 0.0 | 31.337255 |
4 | 2019-12-01 00:00:07+00:00 | view | 5824148 | 1487580005511725929 | 5.56 | 576005683 | 28172809-7e4a-45ce-bab0-5efa90117cd5 | 0 | NaN | 57.0 | 1 | 3 | 0.0 | 35.134503 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
19497287 | 2020-01-31 23:59:52+00:00 | view | 5796984 | 1487580005671109489 | 1.73 | 417102560 | dcf90ff3-a246-4b00-b39a-83e6444a0769 | 0 | NaN | 26.0 | 1 | 2 | 59.0 | 30.269231 |
19497288 | 2020-01-31 23:59:52+00:00 | remove_from_cart | 5886774 | 1487580006317032337 | 1.59 | 607092857 | a4ccd1c4-a9d2-48d0-9816-082ec5bb5e47 | 0 | 11.0 | 57.0 | 3 | 4 | 59.0 | 44.251142 |
19497289 | 2020-01-31 23:59:53+00:00 | view | 5875432 | 2084144451428549153 | 2.05 | 423651741 | fb42a963-abef-4c4f-b1ba-f5812dd54e80 | 0 | NaN | 85.0 | 1 | 3 | 59.0 | 29.880759 |
19497290 | 2020-01-31 23:59:57+00:00 | remove_from_cart | 5820745 | 1487580006317032337 | 2.22 | 607092857 | a4ccd1c4-a9d2-48d0-9816-082ec5bb5e47 | 0 | 11.0 | 57.0 | 3 | 4 | 59.0 | 44.251142 |
19497291 | 2020-01-31 23:59:58+00:00 | remove_from_cart | 5820745 | 1487580006317032337 | 2.22 | 607092857 | a4ccd1c4-a9d2-48d0-9816-082ec5bb5e47 | 0 | 11.0 | 57.0 | 3 | 4 | 59.0 | 44.251142 |
19497292 rows × 14 columns
df_joined['avg_evt_dur_per_user'].mean()
print('Average event time per user in minutes is: ' + str(df_joined['avg_evt_dur_per_user'].mean()))
Average event time per user in minutes is: 29.64472128317097
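The rolling mean above averages the minute-of-hour of three consecutive rows across the whole frame, so it mixes users and wraps at each hour; it is a rough proxy at best. If actual elapsed time between a user's events is wanted, a hedged sketch:
# Mean gap in minutes between a user's consecutive events (hypothetical alternative)
gaps = (df_joined.sort_values('event_time')
                 .groupby('user_id')['event_time']
                 .diff()
                 .dt.total_seconds() / 60)
print('Mean inter-event gap in minutes: ' + str(round(gaps.mean(), 2)))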
# Day-of-week feature per user (0 = Monday; see the note after the table below on max vs. modal day)
df_joined['max_purch_dow_per_user']= df_joined['event_time'].dt.dayofweek
max_purch_dow = df_joined.filter(['max_purch_dow_per_user','user_id']) # Get only the needed columns.
# Highest day-of-week index observed for each user_id (taken over all events, not only purchases)
max_purch_dow_per_user_df = pd.DataFrame(max_purch_dow.groupby('user_id')['max_purch_dow_per_user'].max().reset_index())
# Merge the dataframes
max_purch_dow_per_user_df.columns = ['user_id','max_purch_dayofweek']
df_joined = df_joined.merge(max_purch_dow_per_user_df, how='left', on='user_id')
df_joined
| | event_time | event_type | product_id | category_id | price | user_id | user_session | purchased_yn | total_purchased | session_per_user | event_type_level | event_type_stage | aet_per_user | avg_evt_dur_per_user | max_purch_dow_per_user | max_purch_dayofweek |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2019-12-01 00:00:00+00:00 | remove_from_cart | 5712790 | 1487580005268456287 | 6.27 | 576802932 | 51d85cb0-897f-48d2-918b-ad63965c12dc | 0 | 64.0 | 1266.0 | 3 | 4 | NaN | 28.854893 | 6 | 6 |
1 | 2019-12-01 00:00:00+00:00 | view | 5764655 | 1487580005411062629 | 29.05 | 412120092 | 8adff31e-2051-4894-9758-224bfa8aec18 | 0 | 9.0 | 93.0 | 1 | 4 | NaN | 31.261649 | 6 | 6 |
2 | 2019-12-01 00:00:02+00:00 | cart | 4958 | 1487580009471148064 | 1.19 | 494077766 | c99a50e8-2fac-4c4d-89ec-41c05f114554 | 0 | 50.0 | 127.0 | 2 | 4 | 0.0 | 26.860892 | 6 | 6 |
3 | 2019-12-01 00:00:05+00:00 | view | 5848413 | 1487580007675986893 | 0.79 | 348405118 | 722ffea5-73c0-4924-8e8f-371ff8031af4 | 0 | NaN | 85.0 | 1 | 3 | 0.0 | 31.337255 | 6 | 6 |
4 | 2019-12-01 00:00:07+00:00 | view | 5824148 | 1487580005511725929 | 5.56 | 576005683 | 28172809-7e4a-45ce-bab0-5efa90117cd5 | 0 | NaN | 57.0 | 1 | 3 | 0.0 | 35.134503 | 6 | 6 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
19497287 | 2020-01-31 23:59:52+00:00 | view | 5796984 | 1487580005671109489 | 1.73 | 417102560 | dcf90ff3-a246-4b00-b39a-83e6444a0769 | 0 | NaN | 26.0 | 1 | 2 | 59.0 | 30.269231 | 4 | 5 |
19497288 | 2020-01-31 23:59:52+00:00 | remove_from_cart | 5886774 | 1487580006317032337 | 1.59 | 607092857 | a4ccd1c4-a9d2-48d0-9816-082ec5bb5e47 | 0 | 11.0 | 57.0 | 3 | 4 | 59.0 | 44.251142 | 4 | 6 |
19497289 | 2020-01-31 23:59:53+00:00 | view | 5875432 | 2084144451428549153 | 2.05 | 423651741 | fb42a963-abef-4c4f-b1ba-f5812dd54e80 | 0 | NaN | 85.0 | 1 | 3 | 59.0 | 29.880759 | 4 | 5 |
19497290 | 2020-01-31 23:59:57+00:00 | remove_from_cart | 5820745 | 1487580006317032337 | 2.22 | 607092857 | a4ccd1c4-a9d2-48d0-9816-082ec5bb5e47 | 0 | 11.0 | 57.0 | 3 | 4 | 59.0 | 44.251142 | 4 | 6 |
19497291 | 2020-01-31 23:59:58+00:00 | remove_from_cart | 5820745 | 1487580006317032337 | 2.22 | 607092857 | a4ccd1c4-a9d2-48d0-9816-082ec5bb5e47 | 0 | 11.0 | 57.0 | 3 | 4 | 59.0 | 44.251142 | 4 | 6 |
19497292 rows × 16 columns
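As noted above, .max() returns the highest weekday index a user has any event on, not the day with the most purchases. If the modal purchase day is the goal, a hedged sketch:
# Most frequent weekday among each user's purchase events (hypothetical variant)
purchases = df_joined[df_joined['purchased_yn'] == 1]
modal_dow = (purchases.groupby('user_id')['max_purch_dow_per_user']
                      .agg(lambda s: s.mode().iloc[0]))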
df_joined['max_purch_dayofweek'].plot(kind = 'hist', bins = 7)
plt.title('Histogram: Day of the Week of Most Purchases', fontsize = 18)
plt.ylabel('Frequency, 1.0 = 1 Million Events')
plt.xlabel('Day of the week, 0 = Monday', fontsize = 14);
df_joined.boxplot(column='price', return_type='axes',fontsize = 16)
plt.title('Price Box plot with outliers',fontsize = 16)
plt.ylabel('Euros',fontsize = 16)
plt.show()
Q1 = df_joined.price.quantile(0.25)
Q3 = df_joined.price.quantile(0.75)
IQR = Q3 - Q1
Lower_Limit = Q1 - 1.5 * IQR
Upper_Limit = Q3 + 1.5 * IQR
cols = ['price']
no_outliers = df_joined.price[~((df_joined[cols] < Lower_Limit) | (df_joined[cols] > Upper_Limit)).any(axis=1)]
retained_fraction = len(no_outliers) / len(df_joined) # share of rows kept after outlier removal
percentage = int(retained_fraction * 100)
print('The percentage of the dataset left after outliers are removed from "price" column is: ' + str(percentage) + '%')
The percentage of the dataset left after outliers are removed from "price" column is: 89%
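Dropping the flagged rows discards roughly 11% of all events; clipping prices to the IQR fences is a hedged alternative that keeps every row while capping the extremes:
# Winsorize-style alternative: cap prices at the fences instead of removing rows
clipped_price = df_joined['price'].clip(lower=Lower_Limit, upper=Upper_Limit)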
no_outliers.plot(kind='box',fontsize = 16)
plt.title('Price Box Plot with fewer upper outliers',fontsize = 16)
plt.ylabel('Euros',fontsize = 16)
plt.show()
df_corr = df_joined.corr() # note: newer pandas versions may require numeric_only=True here
sns.set(font_scale=1.2)
sns.heatmap(df_corr, annot=True, annot_kws={"size":16})
df_corr['purchased_yn'].apply(abs).sort_values(ascending=False).plot(kind='bar') # reuse the correlation matrix computed above
plt.show()
df_joined = df_joined.dropna() # drop rows with any NaN (e.g., users with no purchases have NaN total_purchased)
# Earlier feature-set trials, kept for reference (some reference engineered columns from iterations not shown here):
# independent_variables = df_joined['effectiveness'] # R2 = 0.002
# independent_variables = df_joined[['event_type_level','price','user_event_type_level','min_price_per_user']] #R2 = 0.403
# independent_variables = df_joined[['event_type_level','user_event_type_level','price','min_price_per_product','total_purchased','min_price_per_user','session_per_user']] #R2 = 0.509
# independent_variables = df_joined[['event_type_level','price','total_purchased','min_price_per_user']] # R2 = 0.501
# independent_variables = df_joined[['event_type_level','price','min_price_per_user']] # R2 = 0.501
# independent_variables = df_joined[['event_type_level','price','min_price_per_user','avg_evt_dur_per_user']] # R2 = 0.501
# independent_variables = df_joined[['event_type_level','price','avg_evt_dur_per_user','max_purch_dow_per_user_y']] # R2 = 0.504
# independent_variables = df_joined[['event_type_level','user_event_type_level','effectiveness']] # R2 = 0.505
# independent_variables = df_joined[['event_type_level','user_event_type_level','effectiveness','max_purch_dayofweek']] # R2 = 0.507
# independent_variables = df_joined[['event_type_stage','effectiveness']] # R2 = 0.500
# independent_variables = df_joined[['event_type_stage','event_type_level']] # R2 = 0.505
independent_variables = df_joined['event_type_level'] # R2 = 0.499
dependent_variable = df_joined['purchased_yn']
independent_variables = sm.add_constant(independent_variables)
regression_model = sm.OLS(dependent_variable,independent_variables).fit()
regression_model.summary()
Dep. Variable: | purchased_yn | R-squared: | 0.499 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.499 |
Method: | Least Squares | F-statistic: | 1.072e+07 |
Date: | Mon, 14 Mar 2022 | Prob (F-statistic): | 0.00 |
Time: | 13:23:54 | Log-Likelihood: | 5.9852e+05 |
No. Observations: | 10748521 | AIC: | -1.197e+06 |
Df Residuals: | 10748519 | BIC: | -1.197e+06 |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
| | coef | std err | t | P>\|t\| | [0.025 | 0.975] |
|---|---|---|---|---|---|---|
const | -0.3352 | 0.000 | -2159.743 | 0.000 | -0.335 | -0.335 |
event_type_level | 0.2235 | 6.83e-05 | 3274.883 | 0.000 | 0.223 | 0.224 |
Omnibus: | 322831.910 | Durbin-Watson: | 1.074 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 247703.815 |
Skew: | 0.284 | Prob(JB): | 0.00 |
Kurtosis: | 2.521 | Cond. No. | 5.86 |
An R-squared much above 0.5 is improbable when the prediction target depends on psychology: we cannot observe a user's mindset at the moment of purchase. In any field that attempts to predict human behavior, low R-squared values are entirely expected, and that is precisely what this analysis does. Considering this, an R-squared of 0.499 is a reasonable fit for the model.
The coefficient for event_type_level (0.2235) means that each one-unit increase in the event level raises the predicted value of the dependent variable (purchase vs. no purchase) by about 0.22; under this linear probability model, that is roughly a 22-percentage-point increase in purchase likelihood.
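Since purchased_yn is binary, the OLS above is a linear probability model; a logistic regression is the conventional comparison. Note that event_type_level = 4 is by construction exactly the purchase events, so a logistic fit on that single predictor would separate perfectly; the sketch below therefore uses other engineered columns (an illustrative, hypothetical choice):
# Hypothetical comparison: maximum-likelihood logistic fit on non-circular predictors
X = sm.add_constant(df_joined[['price', 'session_per_user', 'avg_evt_dur_per_user']])
logit_model = sm.Logit(df_joined['purchased_yn'], X).fit()
print(logit_model.summary())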
Event type percentages:

event_type | share
---|---
view | 0.49
cart | 0.29
remove_from_cart | 0.15
purchase | 0.07
Expensive shipping is a plausible, significant reason why users remove products from the cart.
A key next step would be to analyze why users remove nearly half of the cart contents before making a purchase. This may indicate that shipping prices are high relative to the order total: for example, a user may find it difficult to justify an 8 Euro purchase while having to pay 10 additional Euros for shipping.
Get one or more years of data; this dataset's purchase events are a very small percentage of all events.
Online user polling may help reveal user/usage 'pain points'.
# Persist the engineered dataframe for downstream modeling
df_joined.to_csv('./ecommerce_analyzed.csv', encoding='utf-8')