import pandas as pd
import numpy as np
import matplotlib as mpl
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style("darkgrid")
mpl.rcParams['figure.figsize'] = (20,5)
import statsmodels.api as sm # For Regression Modeling
import datetime # For converting and extracting datetime data


df = pd.read_csv('c:/load/ecommerce.csv',parse_dates=["event_time"])


df['event_time'].dtypes

datetime64[ns, UTC]


df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20692840 entries, 0 to 20692839
Data columns (total 10 columns):
 #   Column         Dtype              
---  ------         -----              
 0   Unnamed: 0     int64              
 1   event_time     datetime64[ns, UTC]
 2   event_type     object             
 3   product_id     int64              
 4   category_id    int64              
 5   category_code  object             
 6   brand          object             
 7   price          float64            
 8   user_id        int64              
 9   user_session   object             
dtypes: datetime64[ns, UTC](1), float64(1), int64(4), object(4)
memory usage: 1.5+ GB


df.drop(columns=['Unnamed: 0'],inplace=True)


df.columns

Index(['event_time', 'event_type', 'product_id', 'category_id',
       'category_code', 'brand', 'price', 'user_id', 'user_session'],
      dtype='object')


df.isnull().sum()

event_time              0
event_type              0
product_id              0
category_id             0
category_code    20339246
brand             8757117
price                   0
user_id                 0
user_session         4598
dtype: int64


df.isnull().sum()/df.shape[0] * 100

event_time        0.000000
event_type        0.000000
product_id        0.000000
category_id       0.000000
category_code    98.291225
brand            42.319551
price             0.000000
user_id           0.000000
user_session      0.022220
dtype: float64


if 'category_code' in df:
    df.drop(columns=['category_code'],inplace=True)
df.columns

Index(['event_time', 'event_type', 'product_id', 'category_id', 'brand',
       'price', 'user_id', 'user_session'],
      dtype='object')


if 'brand' in df:
    df.drop(columns=['brand'],inplace=True)
df.columns

Index(['event_time', 'event_type', 'product_id', 'category_id', 'price',
       'user_id', 'user_session'],
      dtype='object')


df.drop_duplicates(inplace=True)
print('There are now ' + str(df.shape[0]) + ' rows.')

There are now 19583742 rows.


df['event_type'] = df['event_type'].str.strip()
df['user_session'] = df['user_session'].str.strip()


plt.hist(df['price'], 50, density=True, facecolor='g', alpha=0.75)

plt.title('Price Histogram', fontsize = 18)
plt.xlabel('Euro Bins', fontsize = 16)
plt.ylabel('Frequency', fontsize = 16)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.show()


df.event_type.value_counts().loc[['view', 'cart', 'remove_from_cart','purchase']].plot.bar(xlabel = "Events", ylabel = "Totals, 1.0 = 10 Million Events", title = "Bar chart Event Totals", rot = 45,fontsize = 16);


round(df['event_type'].value_counts(normalize = True),2)

view                0.49
cart                0.29
remove_from_cart    0.15
purchase            0.07
Name: event_type, dtype: float64


df['event_type'].value_counts().sort_values(ascending=False)

view                9656790
cart                5652991
remove_from_cart    2987859
purchase            1286102
Name: event_type, dtype: int64


df.price.max()
print('The highest price in the dataset is: '+ '$' + str(df.price.max()))

The highest price in the dataset is: $327.78


df.price.min() 
print('The lowest price in the dataset is: ' + '$' + str(df.price.min()))

The lowest price in the dataset is: $-79.37


df.sort_values(by=['price']).head()


df = df[~(df['price'] <= 0)] # Removing non-positive price value rows


print('There are now ' + str(df.shape[0]) + ' rows and ' + str(df.shape[1]) + ' columns.')

There are now 19497292 rows and 7 columns.


df.sort_values(by=['price'],ascending = True).head()


print('The average price of a product is '+ str(int(df['price'].mean() ))+ ' Euros.')

The average price of a product is 8 Euros.


counter1 = len(df['event_type'].unique())
print('There are ' + str(counter1) + ' unique values in the ' + 'event_type column')
counter2 = len(df['product_id'].unique())
print('There are ' + str(counter2) + ' unique values in the ' + 'product_id column')
counter3 = len(df['category_id'].unique())
print('There are ' + str(counter3) + ' unique values in the ' + 'category_id column')
counter4 = len(df['price'].unique())
print('There are ' + str(counter4) + ' unique values in the ' + 'price column')
counter5 = len(df['user_id'].unique())
print('There are ' + str(counter5) + ' unique values in the ' + 'user_id column')
counter6 = len(df['user_session'].unique())
print('There are ' + str(counter6) + ' unique values in the ' + 'user_session column')

There are 4 unique values in the event_type column
There are 53904 unique values in the product_id column
There are 523 unique values in the category_id column
There are 2854 unique values in the price column
There are 1637977 unique values in the user_id column
There are 4520285 unique values in the user_session column


event_types = df['event_type'].unique()
print('Unique Event type: ' + event_types[0])
print('Unique Event type: ' + event_types[1])
print('Unique Event type: ' + event_types[2])
print('Unique Event type: ' + event_types[3])

Unique Event type: remove_from_cart
Unique Event type: view
Unique Event type: cart
Unique Event type: purchase


print('There are ' + str(len(df['user_id'].unique())) + ' unique Users.')

There are 1637977 unique Users.


df['purchased_yn'] = np.where(df['event_type']=='purchase', 1, 0)


df['purchased_yn'].value_counts()

0    18211310
1     1285982
Name: purchased_yn, dtype: int64


purchase_event_counts = df['purchased_yn'].value_counts()
print('There are ' + str(purchase_event_counts[1]) + ' purchase events and ' + str(purchase_event_counts[0]) + ' other events.') 
purchase_event_percent = round((purchase_event_counts[1]) / (purchase_event_counts[0]),2) * 100
print('That is ' + str(purchase_event_percent) + '% of all events.')

There are 1285982 purchase events and 18211310 other events.
That is 7.000000000000001% of all events.


# Total number of purchases
total_num_purchases = len(df[(df.purchased_yn == 1)]) # 1 = Purchased
print('Total number of purchases is ' + str(total_num_purchases))

# Total number of purchases per day
total_num_purchases_daily = round(total_num_purchases / 152) # 152 days = 5 months of this dataset = purchases per day
print('Total number of purchases per day is ' + str(total_num_purchases_daily))

# Total number of purchases per month
total_num_purchases_monthly = round((total_num_purchases / 5))
print('Total number of purchases per month is ' + str(total_num_purchases_monthly))

Total number of purchases is 1285982
Total number of purchases per day is 8460
Total number of purchases per month is 257196


# Number of purchases per User
df_g = df.query('purchased_yn == 1') # Query only the purchased (=1) items.
df_g = df_g.filter(['user_id','purchased_yn']) # Get only the needed columns.

# Number of purchases specific to the user_id
user_purchase_df = pd.DataFrame(df_g.groupby('user_id')['purchased_yn'].count()).reset_index()

# Merge the dataframes
user_purchase_df.columns = ['user_id','total_purchased']
df_joined = df.merge(user_purchase_df, how='left', on='user_id')

## Note primary dataframe name is now df_joined


# Calculating average number of purchases
total_gotten = df_joined['total_purchased'].sum()
count_purchased = len(df_joined['total_purchased']==1)

avg_purchased = round(total_gotten / count_purchased,2)
                                                 
print('Average number of purchases per user is: ' + (str(avg_purchased)))

Average number of purchases per user is: 18.94


# Number of sessions per User
df_h = df_joined.query('user_session > "1"') # Query only the user_sessions
df_h = df_h.filter(['user_id','user_session']) # Get only the needed columns.

# The number of sessions specific to (per) the user_id
user_session_df = pd.DataFrame(df_h.groupby('user_id')['user_session'].count()).reset_index()

# Merge the dataframes
user_session_df.columns = ['user_id','session_per_user']
df_joined = df_joined.merge(user_session_df, how='left', on='user_id')

df_joined


# Making a column based on which event the user reached
conditions = [
    (df_joined['event_type'] =='view'), # = 1
    (df_joined['event_type'] =='cart'), # = 2
    (df_joined['event_type'] =='remove_from_cart'), # = 3
    (df_joined['event_type'] =='purchase')] # = 4
choices = [1,2,3,4]
df_joined['event_type_level'] = np.select(conditions,choices,default = 1)
     
# Adding event_level per user.
event_level_per_user_df = pd.DataFrame(df_joined.groupby('user_id')['event_type_level'].nunique()).reset_index()
event_level_per_user_df

# Merge it back into main dataframe
event_level_per_user_df.columns = ['user_id','event_type_stage']
df_joined = df_joined.merge(event_level_per_user_df, how='left', on='user_id')


df_joined


# Average event time per user
df_joined['aet_per_user'] = df_joined.event_time.dt.minute.rolling(window=3).mean()
df_aet = df_joined.filter(['aet_per_user','user_id']) # Get only the needed columns.

# Average event duration specific to (per) the user_id
avg_event_time_df = pd.DataFrame(df_aet.groupby('user_id')['aet_per_user'].mean().reset_index())

# Merge the dataframes
avg_event_time_df.columns = ['user_id','avg_evt_dur_per_user']
df_joined = df_joined.merge(avg_event_time_df, how='left', on='user_id')

df_joined


df_joined['avg_evt_dur_per_user'].mean()
print('Average event time per user in minutes is: ' +  str(df_joined['avg_evt_dur_per_user'].mean()))

Average event time per user in minutes is: 29.64472128317097


# Day of week, most purchases, per user
df_joined['max_purch_dow_per_user']= df_joined['event_time'].dt.dayofweek
max_purch_dow = df_joined.filter(['max_purch_dow_per_user','user_id']) # Get only the needed columns.

# day of week of most purchases specific to (per) the user_id
max_purch_dow_per_user_df = pd.DataFrame(max_purch_dow.groupby('user_id')['max_purch_dow_per_user'].max().reset_index())

# Merge the dataframes
max_purch_dow_per_user_df.columns = ['user_id','max_purch_dayofweek']
df_joined = df_joined.merge(max_purch_dow_per_user_df, how='left', on='user_id')

df_joined


df_joined['max_purch_dayofweek'].plot(kind = 'hist', bins = 7)
plt.title('Histogram  Day of the week of most purchases', fontsize = 18)
plt.ylabel('Frequency 1 = 1 Million Events')
plt.xlabel('Day of the week, 0 = Monday', fontsize = 14);


df_joined.boxplot(column='price', return_type='axes',fontsize = 16)
plt.title('Price Box plot with outliers',fontsize = 16)
plt.ylabel('Euros',fontsize = 16)
plt.show()


Q1 = df_joined.price.quantile(0.25)
Q3 = df_joined.price.quantile(0.75)
IQR = Q3 - Q1

Lower_Limit = Q1 - 1.5 * IQR
Upper_Limit = Q3 + 1.5 * IQR

cols = list(['price'])
no_outliers = df_joined.price[~((df_joined[cols] < (Lower_Limit)) | (df_joined[cols] > (Upper_Limit))).any(axis=1)]

outliers_percent = len(no_outliers)/len(df_joined)
percentage = int(outliers_percent * 100)
print('The percentage of the dataset left after outliers are removed from "price" column is: ' + str(percentage) + '%')

The percentage of the dataset left after outliers are removed from "price" column is: 89%


no_outliers.plot(kind='box',fontsize = 16)
plt.title('Price Box Plot with fewer upper outliers',fontsize = 16)
plt.ylabel('Euros',fontsize = 16)
plt.show()


df_corr = df_joined.corr()
sns.set(font_scale=1.2)
sns.heatmap(df_corr, annot=True, annot_kws={"size":16})

<AxesSubplot:>


df_corr_features = df_joined.corr() 
df_corr_features['purchased_yn'].apply(abs).sort_values(ascending=False).plot(kind='bar') 
plt.show()


df_joined = df_joined.dropna()
# independent_variables = df_joined['effectiveness'] # R2 = 0.002
# independent_variables = df_joined[['event_type_level','price','user_event_type_level','min_price_per_user']] #R2 = 0.403
# independent_variables = df_joined[['event_type_level','user_event_type_level','price','min_price_per_product','total_purchased','min_price_per_user','session_per_user']] #R2 = 0.509
# independent_variables = df_joined[['event_type_level','price','total_purchased','min_price_per_user']] # R2 = 0.501
# independent_variables = df_joined[['event_type_level','price','min_price_per_user']] # R2 = 0.501
# independent_variables = df_joined[['event_type_level','price','min_price_per_user','avg_evt_dur_per_user']] # R2 = 0.501
# independent_variables = df_joined[['event_type_level','price','avg_evt_dur_per_user','max_purch_dow_per_user_y']] # R2 = 0.504
# independent_variables = df_joined[['event_type_level','user_event_type_level','effectiveness']] # R2 = 0.505
# independent_variables = df_joined[['event_type_level','user_event_type_level','effectiveness','max_purch_dayofweek']] # R2 = 0.507
# independent_variables = df_joined[['event_type_stage','effectiveness']] # R2 = 0.500
# independent_variables = df_joined[['event_type_stage','event_type_level']] # R2 = 0.505
independent_variables = df_joined['event_type_level'] # R2 = 0.499
dependent_variable = df_joined['purchased_yn']
independent_variables = sm.add_constant(independent_variables)
regression_model = sm.OLS(dependent_variable,independent_variables).fit()
regression_model.summary()


df_joined.to_csv('./ecommerce_analyzed.csv', encoding='utf-8')

	event_time	event_type	product_id	category_id	price	user_id	user_session
13351304	2020-02-07 21:52:23+00:00	purchase	5716861	1487580014042939619	-79.37	602831022	c07648b8-c372-4936-b5ae-78b0f4236af9
4283228	2019-11-06 17:31:51+00:00	purchase	5716861	1487580014042939619	-79.37	554241372	b85719db-2c56-460b-b800-d7a47d4c4d70
19850269	2020-01-26 18:55:52+00:00	purchase	5716861	1487580014042939619	-79.37	606448681	466cd335-b273-49a4-aefb-3077b5fcd87e
17952565	2020-01-13 13:03:15+00:00	purchase	5716861	1487580014042939619	-79.37	489184067	36f0ade4-fe9f-4353-9ddb-a6600e11b314
12938589	2020-02-05 07:57:45+00:00	purchase	5716861	1487580014042939619	-79.37	562535575	d243bb1b-2ee9-4c21-9b2a-6460469e9a75

	event_time	event_type	product_id	category_id	price	user_id	user_session
8780080	2019-10-05 09:58:08+00:00	cart	5603938	1487580007717929935	0.05	556992916	2ff16c5f-d79e-4a44-ac2b-38ede7547a47
16054042	2020-02-27 10:12:14+00:00	purchase	5603934	1487580007717929935	0.05	565745379	5019ff7a-ca64-4098-ae8d-383c76d27500
8003423	2019-11-29 22:50:37+00:00	view	5603938	1487580007717929935	0.05	554216262	213f3248-fe64-4cda-a41b-1ac4f984610c
993262	2019-12-08 19:38:03+00:00	remove_from_cart	5603937	1487580007717929935	0.05	518113830	313f0460-d950-4e63-93e3-852014a95338
16054043	2020-02-27 10:12:14+00:00	purchase	5603937	1487580007717929935	0.05	565745379	5019ff7a-ca64-4098-ae8d-383c76d27500

	event_time	event_type	product_id	category_id	price	user_id	user_session	purchased_yn	total_purchased	session_per_user
0	2019-12-01 00:00:00+00:00	remove_from_cart	5712790	1487580005268456287	6.27	576802932	51d85cb0-897f-48d2-918b-ad63965c12dc	0	64.0	1266.0
1	2019-12-01 00:00:00+00:00	view	5764655	1487580005411062629	29.05	412120092	8adff31e-2051-4894-9758-224bfa8aec18	0	9.0	93.0
2	2019-12-01 00:00:02+00:00	cart	4958	1487580009471148064	1.19	494077766	c99a50e8-2fac-4c4d-89ec-41c05f114554	0	50.0	127.0
3	2019-12-01 00:00:05+00:00	view	5848413	1487580007675986893	0.79	348405118	722ffea5-73c0-4924-8e8f-371ff8031af4	0	NaN	85.0
4	2019-12-01 00:00:07+00:00	view	5824148	1487580005511725929	5.56	576005683	28172809-7e4a-45ce-bab0-5efa90117cd5	0	NaN	57.0
...	...	...	...	...	...	...	...	...	...	...
19497287	2020-01-31 23:59:52+00:00	view	5796984	1487580005671109489	1.73	417102560	dcf90ff3-a246-4b00-b39a-83e6444a0769	0	NaN	26.0
19497288	2020-01-31 23:59:52+00:00	remove_from_cart	5886774	1487580006317032337	1.59	607092857	a4ccd1c4-a9d2-48d0-9816-082ec5bb5e47	0	11.0	57.0
19497289	2020-01-31 23:59:53+00:00	view	5875432	2084144451428549153	2.05	423651741	fb42a963-abef-4c4f-b1ba-f5812dd54e80	0	NaN	85.0
19497290	2020-01-31 23:59:57+00:00	remove_from_cart	5820745	1487580006317032337	2.22	607092857	a4ccd1c4-a9d2-48d0-9816-082ec5bb5e47	0	11.0	57.0
19497291	2020-01-31 23:59:58+00:00	remove_from_cart	5820745	1487580006317032337	2.22	607092857	a4ccd1c4-a9d2-48d0-9816-082ec5bb5e47	0	11.0	57.0

	event_time	event_type	product_id	category_id	price	user_id	user_session	purchased_yn	total_purchased	session_per_user	event_type_level	event_type_stage
0	2019-12-01 00:00:00+00:00	remove_from_cart	5712790	1487580005268456287	6.27	576802932	51d85cb0-897f-48d2-918b-ad63965c12dc	0	64.0	1266.0	3	4
1	2019-12-01 00:00:00+00:00	view	5764655	1487580005411062629	29.05	412120092	8adff31e-2051-4894-9758-224bfa8aec18	0	9.0	93.0	1	4
2	2019-12-01 00:00:02+00:00	cart	4958	1487580009471148064	1.19	494077766	c99a50e8-2fac-4c4d-89ec-41c05f114554	0	50.0	127.0	2	4
3	2019-12-01 00:00:05+00:00	view	5848413	1487580007675986893	0.79	348405118	722ffea5-73c0-4924-8e8f-371ff8031af4	0	NaN	85.0	1	3
4	2019-12-01 00:00:07+00:00	view	5824148	1487580005511725929	5.56	576005683	28172809-7e4a-45ce-bab0-5efa90117cd5	0	NaN	57.0	1	3
...	...	...	...	...	...	...	...	...	...	...	...	...
19497287	2020-01-31 23:59:52+00:00	view	5796984	1487580005671109489	1.73	417102560	dcf90ff3-a246-4b00-b39a-83e6444a0769	0	NaN	26.0	1	2
19497288	2020-01-31 23:59:52+00:00	remove_from_cart	5886774	1487580006317032337	1.59	607092857	a4ccd1c4-a9d2-48d0-9816-082ec5bb5e47	0	11.0	57.0	3	4
19497289	2020-01-31 23:59:53+00:00	view	5875432	2084144451428549153	2.05	423651741	fb42a963-abef-4c4f-b1ba-f5812dd54e80	0	NaN	85.0	1	3
19497290	2020-01-31 23:59:57+00:00	remove_from_cart	5820745	1487580006317032337	2.22	607092857	a4ccd1c4-a9d2-48d0-9816-082ec5bb5e47	0	11.0	57.0	3	4
19497291	2020-01-31 23:59:58+00:00	remove_from_cart	5820745	1487580006317032337	2.22	607092857	a4ccd1c4-a9d2-48d0-9816-082ec5bb5e47	0	11.0	57.0	3	4

	event_time	event_type	product_id	category_id	price	user_id	user_session	purchased_yn	total_purchased	session_per_user	event_type_level	event_type_stage	aet_per_user	avg_evt_dur_per_user
0	2019-12-01 00:00:00+00:00	remove_from_cart	5712790	1487580005268456287	6.27	576802932	51d85cb0-897f-48d2-918b-ad63965c12dc	0	64.0	1266.0	3	4	NaN	28.854893
1	2019-12-01 00:00:00+00:00	view	5764655	1487580005411062629	29.05	412120092	8adff31e-2051-4894-9758-224bfa8aec18	0	9.0	93.0	1	4	NaN	31.261649
2	2019-12-01 00:00:02+00:00	cart	4958	1487580009471148064	1.19	494077766	c99a50e8-2fac-4c4d-89ec-41c05f114554	0	50.0	127.0	2	4	0.0	26.860892
3	2019-12-01 00:00:05+00:00	view	5848413	1487580007675986893	0.79	348405118	722ffea5-73c0-4924-8e8f-371ff8031af4	0	NaN	85.0	1	3	0.0	31.337255
4	2019-12-01 00:00:07+00:00	view	5824148	1487580005511725929	5.56	576005683	28172809-7e4a-45ce-bab0-5efa90117cd5	0	NaN	57.0	1	3	0.0	35.134503
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
19497287	2020-01-31 23:59:52+00:00	view	5796984	1487580005671109489	1.73	417102560	dcf90ff3-a246-4b00-b39a-83e6444a0769	0	NaN	26.0	1	2	59.0	30.269231
19497288	2020-01-31 23:59:52+00:00	remove_from_cart	5886774	1487580006317032337	1.59	607092857	a4ccd1c4-a9d2-48d0-9816-082ec5bb5e47	0	11.0	57.0	3	4	59.0	44.251142
19497289	2020-01-31 23:59:53+00:00	view	5875432	2084144451428549153	2.05	423651741	fb42a963-abef-4c4f-b1ba-f5812dd54e80	0	NaN	85.0	1	3	59.0	29.880759
19497290	2020-01-31 23:59:57+00:00	remove_from_cart	5820745	1487580006317032337	2.22	607092857	a4ccd1c4-a9d2-48d0-9816-082ec5bb5e47	0	11.0	57.0	3	4	59.0	44.251142
19497291	2020-01-31 23:59:58+00:00	remove_from_cart	5820745	1487580006317032337	2.22	607092857	a4ccd1c4-a9d2-48d0-9816-082ec5bb5e47	0	11.0	57.0	3	4	59.0	44.251142

Dep. Variable:	purchased_yn	R-squared:	0.499
Model:	OLS	Adj. R-squared:	0.499
Method:	Least Squares	F-statistic:	1.072e+07
Date:	Mon, 14 Mar 2022	Prob (F-statistic):	0.00
Time:	13:23:54	Log-Likelihood:	5.9852e+05
No. Observations:	10748521	AIC:	-1.197e+06
Df Residuals:	10748519	BIC:	-1.197e+06
Df Model:	1
Covariance Type:	nonrobust

	coef	std err	t	P>\|t\|	[0.025	0.975]
const	-0.3352	0.000	-2159.743	0.000	-0.335	-0.335
event_type_level	0.2235	6.83e-05	3274.883	0.000	0.223	0.224

Omnibus:	322831.910	Durbin-Watson:	1.074
Prob(Omnibus):	0.000	Jarque-Bera (JB):	247703.815
Skew:	0.284	Prob(JB):	0.00
Kurtosis:	2.521	Cond. No.	5.86

Data Analysis of eCommerce User Activity - by Ian Hudson¶

Using 5 months of user activities from view to purchase to see if dependent Y (user likely to purchase or not), can be predicted.¶

20.6 million user events, upon 53,904 products from a Multi-category Online Store (MOS) eCommerce website.¶

Let's get some useful libraries.¶

Creating the Dataframe and converting the event_time column to datetime type¶

Convert the event_time column to datetime type¶

Checking the event_time data type¶

Data Cleansing¶

What does the dataframe look like?¶

Lets remove that unamed:0 column, it's a duplicate of the index column.¶

That column should now be gone, let's confirm¶

Let's see if there are any missing values in columns¶

There are some missing values in category_code, brand and user_session.¶

Let's check out what percentage of data is missing.¶

98% of category_code is missing, 42% of brand is missing, 0.02% of user_session is missing.¶

There is a huge amount of missing data for category_code and brand so I will drop these columns from the dataframe.¶

User_session will remain in the dataframe.¶

Let's remove the category_code column¶

Let's also remove the brand column¶

Let's remove any duplicate row data.¶

Let's remove any leading or trailing spaces from string columns event_type and user_session¶

Now that the data is clean let's do some exploration¶

To see the distribution of data, let's create some Histograms¶

The most common price is the 1 - 15 Euro range. Hmm, there seems to be some non-positive values in the price series, I'll have to check on that later¶

Bar chart Event totals¶

The most frequent events are viewing products then putting products into the cart, removing a product from the cart and lastly making a purchase. Each event drops down to about half of the previous event.¶

Let's see what the percentages are¶

What are the 'Event type' totals?¶

Why are there non-positive values in the 'price' column?¶

We will remove 'product refunds' (the non-positive price values) since they are beyond the scope of this analysis.¶

Checking to see if any still exsist ...¶

Looks like we are ok now.¶

What's the average price for a product?¶

Let's see the unique value counts from select columns¶

What are the unique values for event_type?¶

How many unique Users are there?¶

Creating new variables¶

Let's create our Y dependent column¶

1 for 'purchased' and 0 for everything else¶

Let's get the number of total purchases¶

Let's get the number of purchases per user¶

Let's create a number of sessions per user column¶

Let's get the event levels per user¶

Let's get the average event duration per user in minutes¶

Day of week, most purchases, per user¶

The day that most purchases occur is day 6, Sunday¶

Let's Investigate outliers with Box plots¶

There are many outliers and they will be removed so as not to skew the data.¶

Let's identify our outliers¶

Adjusted - Price Box plot¶

Investigate Correlations¶

Bar plot of Correlated Features¶

Notes:¶

Summary¶

Insights¶

Recommendations¶

Exporting analyzed dataset to CSV file¶