import pandas as pd
import pickle
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, LabelBinarizer, StandardScaler,MinMaxScaler
from sklearn.feature_selection import RFECV
from sklearn_pandas import DataFrameMapper


#import pre-cleaned modelling data

data = pickle.load(open("C:/Users/Ibiene/OneDrive/DataScience_MachineLearning/Data Science Infinity/Machine Learning/Model Building/data/abc_classification_modelling.p", "rb"))


data.head()


# note x and y values. y is the signup flag.


data.shape

(860, 10)


#check for missing values 
data.isna().sum()

customer_id             0
signup_flag             0
distance_from_store     5
gender                  5
credit_score            8
total_sales             0
total_items             0
transaction_count       0
product_area_count      0
average_basket_value    0
dtype: int64


#check for percentage of missing values
(data.isna().sum()/len(data))*100

customer_id             0.000000
signup_flag             0.000000
distance_from_store     0.581395
gender                  0.581395
credit_score            0.930233
total_sales             0.000000
total_items             0.000000
transaction_count       0.000000
product_area_count      0.000000
average_basket_value    0.000000
dtype: float64


#drop nulls
data = data.dropna()


#drop unnecessary cols
data.drop("customer_id", axis = 1, inplace= True)


#shuffle data
data= shuffle(data, random_state = 23)


#since this is a classification task, check for class balance
data.signup_flag.value_counts(normalize = True)

0    0.695396
1    0.304604
Name: signup_flag, dtype: float64


data.signup_flag.value_counts()

0    589
1    258
Name: signup_flag, dtype: int64


data.describe()


data.plot(kind= 'box', subplots= True, layout = (3, 3), sharex= False, sharey= False, figsize =(16, 10) )
plt.show()


data.hist(figsize =(16, 10))
plt.show()


fig = px.histogram(data, x = "distance_from_store", nbins = 30)
fig.show()


data.columns

Index(['signup_flag', 'distance_from_store', 'gender', 'credit_score',
       'total_sales', 'total_items', 'transaction_count', 'product_area_count',
       'average_basket_value'],
      dtype='object')


#using 5 columns 
outlier_investigation = data.describe()
outlier_columns = ['distance_from_store', 'total_sales', 'total_items', 'transaction_count','average_basket_value']

for column in outlier_columns:
    lower_quartile = data[column].quantile(0.25)
    upper_quartile = data[column].quantile(0.75)
    iqr = upper_quartile - lower_quartile
    iqr_extended = iqr * 2
    min_border = lower_quartile - iqr_extended
    max_border = upper_quartile + iqr_extended
    
    outliers = data[(data[column] < min_border)|(data[column] > max_border)].index
    print(f"{len(outliers)} outliers detected in column {column}")

8 outliers detected in column distance_from_store
54 outliers detected in column total_sales
41 outliers detected in column total_items
19 outliers detected in column transaction_count
34 outliers detected in column average_basket_value


data1 = data.drop(outliers)


data1.shape

(813, 9)


X = data1.drop("signup_flag", axis = 1)


y = data1.signup_flag


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 23, stratify = y)


X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 813 entries, 155 to 604
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   distance_from_store   813 non-null    float64
 1   gender                813 non-null    object 
 2   credit_score          813 non-null    float64
 3   total_sales           813 non-null    float64
 4   total_items           813 non-null    int64  
 5   transaction_count     813 non-null    int64  
 6   product_area_count    813 non-null    int64  
 7   average_basket_value  813 non-null    float64
dtypes: float64(4), int64(3), object(1)
memory usage: 57.2+ KB


X.columns

Index(['distance_from_store', 'gender', 'credit_score', 'total_sales',
       'total_items', 'transaction_count', 'product_area_count',
       'average_basket_value'],
      dtype='object')


mapper = DataFrameMapper([
    (['distance_from_store'], StandardScaler()), 
      ('gender', LabelBinarizer()), 
      (['credit_score'], StandardScaler()),
      (['total_sales'],StandardScaler()),
       (['total_items'], StandardScaler()),
      (['transaction_count'],StandardScaler()), 
      (['product_area_count'],StandardScaler()), 
       (['average_basket_value'], StandardScaler())], df_out = True)


mapper.fit(X_train)

DataFrameMapper(df_out=True, drop_cols=[],
                features=[(['distance_from_store'], StandardScaler()),
                          ('gender', LabelBinarizer()),
                          (['credit_score'], StandardScaler()),
                          (['total_sales'], StandardScaler()),
                          (['total_items'], StandardScaler()),
                          (['transaction_count'], StandardScaler()),
                          (['product_area_count'], StandardScaler()),
                          (['average_basket_value'], StandardScaler())])


Z_train = mapper.transform(X_train)
Z_test = mapper.transform(X_test)


Z_train.sample(3)


#RFECV

clf = LogisticRegression(random_state = 43, max_iter = 1000)
feature_selector = RFECV(clf)

fit = feature_selector.fit(Z_train, y_train)

optimal_feature_count = feature_selector.n_features_
print(f"Optimal no of features is {optimal_feature_count}")

Optimal no of features is 5


feature_selector.get_support()

array([ True,  True, False, False, False,  True,  True,  True])


#limit train and test data to only include selected variables 
Z_train = Z_train.loc[:, feature_selector.get_support()]
Z_test = Z_test.loc[:, feature_selector.get_support()]


plt.style.use('seaborn-poster')
plt.plot(range(1, len(fit.cv_results_['mean_test_score']) + 1), fit.cv_results_['mean_test_score'], marker = "o")
plt.ylabel("Classification Accuracy")
plt.xlabel("Number of Features")
plt.title(f"Feature Selection using RFECV \n Optimal number of features is {optimal_feature_count} (at score of {round(max(fit.cv_results_['mean_test_score']),4)})")
plt.tight_layout()
plt.show()


clf = LogisticRegression(random_state = 42, max_iter = 1000)
clf.fit(Z_train, y_train)

LogisticRegression(max_iter=1000, random_state=42)


y_pred_class= clf.predict(Z_test)
y_pred_prob = clf.predict_proba(Z_test)[:, 1] 
accuracy_score(y_test, y_pred_class)

0.8957055214723927


conf_matrix = confusion_matrix(y_test, y_pred_class)
conf_matrix

array([[112,   6],
       [ 11,  34]], dtype=int64)


plt.style.use("seaborn-poster")
plt.matshow(conf_matrix, cmap = "coolwarm")
plt.gca().xaxis.tick_bottom()
plt.title("Confusion Matrix")
plt.ylabel("Actual Class")
plt.xlabel("Predicted Class")
for (i, j), corr_value in np.ndenumerate(conf_matrix):
    plt.text(j, i, corr_value, ha = "center", va = "center", fontsize = 20)
plt.show()


# classification accuracy
accuracy_lr = round(accuracy_score(y_test, y_pred_class)*100, 2)
print(f"Accuracy score is {accuracy_lr}, meaning the model coorectly predicts {accuracy_lr}% of the test set observations")

# precision
precision_lr = round(precision_score(y_test, y_pred_class)*100, 2)
print(f"Precision score is {precision_lr}, meaning for our predicted delivery club signups, the model was correct {precision_lr}% of the time")

# recall
recall_lr = round(recall_score(y_test, y_pred_class)*100, 2)
print(f"Recall score is {recall_lr}, meaning of allactual delivery club signups, the model correctly predicts {recall_lr}% of the time")

# f1-score
f1_lr = round(f1_score(y_test, y_pred_class), 2)
print(f"F1 score is {f1_lr}")

Accuracy score is 89.57, meaning the model coorectly predicts 89.57% of the test set observations
Precision score is 85.0, meaning for our predicted delivery club signups, the model was correct 85.0% of the time
Recall score is 75.56, meaning of allactual delivery club signups, the model correctly predicts 75.56% of the time
F1 score is 0.8


#finding the optimal classification threshold 

# set up the list of thresholds to loop through
thresholds = np.arange(0, 1, 0.01)

# create empty lists to append the results to
precision_scores = []
recall_scores = []
f1_scores = []

# loop through each threshold - fit the model - append the results
for threshold in thresholds:
    
    pred_class = (y_pred_prob >= threshold) * 1
    
    precision = precision_score(y_test, pred_class, zero_division = 0)
    precision_scores.append(precision)
    
    recall = recall_score(y_test, pred_class)
    recall_scores.append(recall)
    
    f1 = f1_score(y_test, pred_class)
    f1_scores.append(f1)
    
# extract the optimal f1-score (and it's index)
max_f1 = max(f1_scores)
max_f1_idx = f1_scores.index(max_f1)



# plot the results
plt.style.use("seaborn-poster")
plt.plot(thresholds, precision_scores, label = "Precision", linestyle = "--")
plt.plot(thresholds, recall_scores, label = "Recall", linestyle = "--")
plt.plot(thresholds, f1_scores, label = "F1", linewidth = 5)
plt.title(f"Finding the Optimal Threshold for Classification Model \n Max F1: {round(max_f1,2)} (Threshold = {round(thresholds[max_f1_idx],2)})")
plt.xlabel("Threshold")
plt.ylabel("Assessment Score")
plt.legend(loc = "lower left")
plt.tight_layout()
plt.show()


mapper_dt = DataFrameMapper([
    (['distance_from_store'], None), 
      ('gender', LabelBinarizer()), 
      (['credit_score'], None),
      (['total_sales'],None),
       (['total_items'], None),
      (['transaction_count'],None), 
      (['product_area_count'],None), 
       (['average_basket_value'], None)], df_out = True)


mapper_dt.fit(X_train)

DataFrameMapper(df_out=True, drop_cols=[],
                features=[(['distance_from_store'], None),
                          ('gender', LabelBinarizer()),
                          (['credit_score'], None), (['total_sales'], None),
                          (['total_items'], None),
                          (['transaction_count'], None),
                          (['product_area_count'], None),
                          (['average_basket_value'], None)])


Z_train_dt = mapper_dt.transform (X_train)
Z_test_dt = mapper_dt.transform(X_test)


Z_train_dt.sample(2)


Z_test_dt.sample(2)


dt = DecisionTreeClassifier(random_state = 42, max_depth = 5)


dt.fit(Z_train_dt, y_train)

DecisionTreeClassifier(max_depth=5, random_state=42)


y_pred_class_dt = dt.predict(Z_test_dt)


y_pred_prob_dt = dt.predict_proba(Z_test_dt)[:, 1]


# create the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred_class_dt)

# plot the confusion matrix
plt.style.use("seaborn-poster")
plt.matshow(conf_matrix, cmap = "coolwarm")
plt.gca().xaxis.tick_bottom()
plt.title("Confusion Matrix")
plt.ylabel("Actual Class")
plt.xlabel("Predicted Class")
for (i, j), corr_value in np.ndenumerate(conf_matrix):
    plt.text(j, i, corr_value, ha = "center", va = "center", fontsize = 20)
plt.show()


accuracy_dt = round(accuracy_score(y_test, y_pred_class_dt)*100, 2)
print(f"Accuracy score is {accuracy_dt}, meaning the model coorectly predicts {accuracy_dt}% of the test set observations")

# precision
precision_dt =  round(precision_score(y_test, y_pred_class_dt)*100, 2)
print(f"Precision score is {precision_dt}, meaning for our predicted delivery club signups, the model was correct {precision_dt}% of the time")

# recall
recall_dt= round(recall_score(y_test, y_pred_class_dt)*100, 2)
print(f"Recall score is {recall_dt}, meaning of allactual delivery club signups, the model correctly predicts {recall_dt}% of the time")

# f1-score
f1_dt = round(f1_score(y_test, y_pred_class_dt), 2)
print(f"F1 score is {f1_dt}")

Accuracy score is 96.93, meaning the model coorectly predicts 96.93% of the test set observations
Precision score is 93.48, meaning for our predicted delivery club signups, the model was correct 93.48% of the time
Recall score is 95.56, meaning of allactual delivery club signups, the model correctly predicts 95.56% of the time
F1 score is 0.95


plt.figure(figsize=(25,15))
tree = plot_tree(dt,
                 feature_names = X.columns,
                 filled = True,
                 rounded = True,
                 fontsize = 16)


#finding the best max depth to prevent overfitting 

max_depth_list = list(range(1,20))
accuracy_scores = []

# loop through each possible depth, train and validate model, append test set f1-score
for depth in max_depth_list:
    
    dt = DecisionTreeClassifier(max_depth = depth, random_state = 42)
    dt.fit(Z_train_dt,y_train)
    y_pred_dt = dt.predict(Z_test_dt)
    accuracy = f1_score(y_test,y_pred_dt)
    accuracy_scores.append(accuracy)
    
# store max accuracy, and optimal depth    
max_accuracy = max(accuracy_scores)
max_accuracy_idx = accuracy_scores.index(max_accuracy)
optimal_depth = max_depth_list[max_accuracy_idx]

df = pd.DataFrame({'max_depth_list': max_depth_list, 'accuracy_scores': accuracy_scores})
                   
# plot accuracy by max depth
px.line(df, x = "max_depth_list", y = "accuracy_scores", 
        title = f"Accuracy (F1 score) by Max Depth \n Optimal Tree Depth: {optimal_depth} (F1 Score: {round(max_accuracy, 4)})")


from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance


mapper_rf = DataFrameMapper([
    (['distance_from_store'], None), 
      ('gender', LabelBinarizer()), 
      (['credit_score'], None),
      (['total_sales'],None),
       (['total_items'], None),
      (['transaction_count'],None), 
      (['product_area_count'],None), 
       (['average_basket_value'], None)], df_out = True)


mapper_rf.fit(X_train)

DataFrameMapper(df_out=True, drop_cols=[],
                features=[(['distance_from_store'], None),
                          ('gender', LabelBinarizer()),
                          (['credit_score'], None), (['total_sales'], None),
                          (['total_items'], None),
                          (['transaction_count'], None),
                          (['product_area_count'], None),
                          (['average_basket_value'], None)])


Z_train_rf = mapper_dt.transform (X_train)
Z_test_rf= mapper_dt.transform(X_test)


Z_train_rf.sample(3)


Z_test_rf.sample(3)


rf = RandomForestClassifier(random_state = 23, n_estimators = 500, max_features = 5)


rf.fit(Z_train_rf, y_train)

RandomForestClassifier(max_features=5, n_estimators=500, random_state=23)


y_pred_class_rf = rf.predict(Z_test_rf)
y_pred_prob_rf = rf.predict_proba(Z_test_rf)[:,1]


# create the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred_class_rf)

# plot the confusion matrix
plt.style.use("seaborn-poster")
plt.matshow(conf_matrix, cmap = "coolwarm")
plt.gca().xaxis.tick_bottom()
plt.title("Confusion Matrix")
plt.ylabel("Actual Class")
plt.xlabel("Predicted Class")
for (i, j), corr_value in np.ndenumerate(conf_matrix):
    plt.text(j, i, corr_value, ha = "center", va = "center", fontsize = 20)
plt.show()


accuracy_rf = round(accuracy_score(y_test, y_pred_class_rf)*100, 2)
print(f"Accuracy score is {accuracy_rf}, meaning the model correctly predicts {accuracy_rf}% of the test set observations")

# precision
precision_rf = round(precision_score(y_test, y_pred_class_rf)*100, 2)
print(f"Precision score is {precision_rf}, meaning for our predicted delivery club signups, the model was correct {precision_rf}% of the time")

# recall
recall_rf = round(recall_score(y_test, y_pred_class_rf)*100, 2)
print(f"Recall score is {recall_rf}, meaning of allactual delivery club signups, the model correctly predicts {recall_rf}% of the time")

# f1-score
f1_rf = round(f1_score(y_test, y_pred_class_rf), 2)
print(f"F1 score is {f1_rf}")

Accuracy score is 95.71, meaning the model correctly predicts 95.71% of the test set observations
Precision score is 93.18, meaning for our predicted delivery club signups, the model was correct 93.18% of the time
Recall score is 91.11, meaning of allactual delivery club signups, the model correctly predicts 91.11% of the time
F1 score is 0.92


rf.feature_importances_

array([0.44067598, 0.07682073, 0.02565538, 0.11114866, 0.04675597,
       0.06290855, 0.15752548, 0.07850924])


feature_importance = rf.feature_importances_
feature_names = X.columns
feature_importance_summary = pd.DataFrame({'Feature Names': feature_names, 'Feature Importance': feature_importance})
feature_importance_summary.sort_values(by = "Feature Importance", inplace = True)
print(feature_importance_summary)


# plot feature importance
plt.barh(feature_importance_summary["Feature Names"],feature_importance_summary["Feature Importance"])
plt.title("Feature Importance of Random Forest")
plt.xlabel("Feature Importance")
plt.tight_layout()
plt.show()

          Feature Names  Feature Importance
2          credit_score            0.025655
4           total_items            0.046756
5     transaction_count            0.062909
1                gender            0.076821
7  average_basket_value            0.078509
3           total_sales            0.111149
6    product_area_count            0.157525
0   distance_from_store            0.440676


# calculate permutation importance
result = permutation_importance(rf, Z_test_rf, y_test, n_repeats = 10, random_state = 23)
permutation_importance = pd.DataFrame(result["importances_mean"])
feature_names = pd.DataFrame(X.columns)
permutation_importance_summary = pd.concat([feature_names,permutation_importance], axis = 1)
permutation_importance_summary.columns = ["input_variable","permutation_importance"]
permutation_importance_summary.sort_values(by = "permutation_importance", inplace = True)

# plot permutation importance
plt.barh(permutation_importance_summary["input_variable"],permutation_importance_summary["permutation_importance"])
plt.title("Permutation Importance of Random Forest")
plt.xlabel("Permutation Importance")
plt.tight_layout()
plt.show()


from sklearn.neighbors import KNeighborsClassifier


mapper = DataFrameMapper([
    (['distance_from_store'], MinMaxScaler()), 
      ('gender', LabelBinarizer()), 
      (['credit_score'], MinMaxScaler()),
      (['total_sales'],MinMaxScaler()),
       (['total_items'], MinMaxScaler()),
      (['transaction_count'],MinMaxScaler()), 
      (['product_area_count'],MinMaxScaler()), 
       (['average_basket_value'], MinMaxScaler())], df_out = True)


mapper.fit(X_train)

DataFrameMapper(df_out=True, drop_cols=[],
                features=[(['distance_from_store'], MinMaxScaler()),
                          ('gender', LabelBinarizer()),
                          (['credit_score'], MinMaxScaler()),
                          (['total_sales'], MinMaxScaler()),
                          (['total_items'], MinMaxScaler()),
                          (['transaction_count'], MinMaxScaler()),
                          (['product_area_count'], MinMaxScaler()),
                          (['average_basket_value'], MinMaxScaler())])


Z_train_knn = mapper.transform(X_train)
Z_test_knn = mapper.transform(X_test)


Z_train_knn.sample(3)


#using RFECV

rfc = RandomForestClassifier(random_state = 23)
feature_selector1 = RFECV(rfc)

fit = feature_selector1.fit(Z_train_knn, y_train)


optimal_feature_count = feature_selector1.n_features_
print(f"Optimal no of features: {optimal_feature_count}")

Optimal no of features: 4


support_results = feature_selector1.get_support()


input = Z_train_knn.columns


optimal_feature_summary = pd.DataFrame({
    'input': input, 
    'support_results': support_results
})
optimal_feature_summary


#limit train and test set to include only selected variables
Z_train_knn = Z_train_knn.loc[:, feature_selector1.get_support()]
Z_test_knn = Z_test_knn.loc[:, feature_selector1.get_support()]


plt.style.use('seaborn-poster')
plt.plot(range(1, len(fit.cv_results_['mean_test_score']) + 1), fit.cv_results_['mean_test_score'], marker = "o")
plt.ylabel("Classification Accuracy")
plt.xlabel("Number of Features")
plt.title(f"Feature Selection using RFECV \n Optimal number of features is {optimal_feature_count} (at score of {round(max(fit.cv_results_['mean_test_score']),4)})")
plt.tight_layout()
plt.show()


knn = KNeighborsClassifier()
knn.fit(Z_train_knn, y_train)

KNeighborsClassifier()


# predict on the test set
y_pred_class_knn = knn.predict(Z_test_knn)
y_pred_prob_knn = knn.predict_proba(Z_test_knn)[:,1]


# create the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred_class_knn)

# plot the confusion matrix
plt.style.use("seaborn-poster")
plt.matshow(conf_matrix, cmap = "coolwarm")
plt.gca().xaxis.tick_bottom()
plt.title("Confusion Matrix")
plt.ylabel("Actual Class")
plt.xlabel("Predicted Class")
for (i, j), corr_value in np.ndenumerate(conf_matrix):
    plt.text(j, i, corr_value, ha = "center", va = "center", fontsize = 20)
plt.show()


accuracy_knn = round(accuracy_score(y_test, y_pred_class_knn)*100, 2)
print(f"Accuracy score is {accuracy_knn}, meaning the model coorectly predicts {accuracy_knn} % of the test set observations")

# precision
precision_knn = round(precision_score(y_test, y_pred_class_knn)*100, 2)
print(f"Precision score is {precision_knn}, meaning for our predicted delivery club signups, the model was correct {precision_knn}% of the time")

# recall
recall_knn = round(recall_score(y_test, y_pred_class_knn)*100, 2)
print(f"Recall score is {recall_rf}, meaning of allactual delivery club signups, the model correctly predicts {recall_knn}% of the time")

# f1-score
f1_knn = round(f1_score(y_test, y_pred_class_knn), 2)
print(f"F1 score is {f1_knn}")

Accuracy score is 81.6, meaning the model coorectly predicts 81.6 % of the test set observations
Precision score is 69.23, meaning for our predicted delivery club signups, the model was correct 69.23% of the time
Recall score is 91.11, meaning of allactual delivery club signups, the model correctly predicts 60.0% of the time
F1 score is 0.64


# set up range for search, and empty list to append accuracy scores to
k_list = list(range(2,25))
accuracy_scores = []

# loop through each possible value of k, train and validate model, append test set f1-score
for k in k_list:
    
    knn = KNeighborsClassifier(n_neighbors = k)
    knn.fit(Z_train_knn,y_train)
    y_pred = knn.predict(Z_test_knn)
    accuracy = f1_score(y_test,y_pred)
    accuracy_scores.append(accuracy)
    
# store max accuracy, and optimal k value    
max_accuracy = max(accuracy_scores)
max_accuracy_idx = accuracy_scores.index(max_accuracy)
optimal_k_value = k_list[max_accuracy_idx]

# plot accuracy by max depth
plt.plot(k_list,accuracy_scores)
plt.scatter(optimal_k_value, max_accuracy, marker = "x", color = "red")
plt.title(f"Accuracy (F1 Score) by k \n Optimal Value for k: {optimal_k_value} (Accuracy: {round(max_accuracy,4)})")
plt.xlabel("k")
plt.ylabel("Accuracy (F1 Score)")
plt.tight_layout()
plt.show()


models =["Logistic Regression", "Decision Tree", "Random Forest", "K Nearest Neighbors"]
accuracy_list = [accuracy_lr, accuracy_dt, accuracy_rf, accuracy_knn]
precision_list = [precision_lr, precision_dt, precision_rf, precision_knn]
recall_list=[recall_lr, recall_dt, recall_rf, recall_knn]
f1_list = [f1_lr, f1_dt, f1_rf, f1_knn]


model_summary = pd.DataFrame({
    'Models': models, 
    'Accuracy': accuracy_list, 
    'Precision': precision_list, 
    'Recall': recall_list, 
    'F1-score': f1_list, 
})
model_summary


#save dataframe to a .png file

import dataframe_image as dfi


dfi.export(model_summary, 'model_summary.png')

Variable Name	Variable Type	Description
signup_flag	Dependent	A binary variable showing if the customer signed up for the delivery club in the last campaign
distance_from_store	Independent	The distance in miles from the customers home address, and the store
gender	Independent	The gender provided by the customer
credit_score	Independent	The customers most recent credit score
total_sales	Independent	Total spend by the customer in ABC Grocery - 3 months pre campaign
total_items	Independent	Total products purchased by the customer in ABC Grocery - 3 months pre campaign
transaction_count	Independent	Total unique transactions made by the customer in ABC Grocery - 3 months pre campaign
product_area_count	Independent	The number of product areas within ABC Grocery the customers has shopped into - 3 months pre campaign
average_basket_value	Independent	The average spend per transaction for the customer in ABC Grocery - 3 months pre campaign

	customer_id	signup_flag	distance_from_store	gender	credit_score	total_sales	total_items	transaction_count	product_area_count	average_basket_value
0	74	1	3.38	F	0.59	1586.89	195	26	5	61.034231
1	524	1	4.76	F	0.52	2397.26	258	27	5	88.787407
2	607	1	4.45	F	0.49	1279.91	183	22	5	58.177727
3	343	0	0.91	M	0.54	967.14	102	17	5	56.890588
4	322	1	3.02	F	0.63	1566.35	182	30	5	52.211667

	signup_flag	distance_from_store	credit_score	total_sales	total_items	transaction_count	product_area_count	average_basket_value
count	847.000000	847.000000	847.000000	847.000000	847.000000	847.000000	847.000000	847.000000
mean	0.304604	2.614545	0.597521	968.166411	143.877214	22.214876	4.177096	38.034161
std	0.460512	14.397590	0.102264	1073.647531	125.342694	11.721699	0.920887	24.243691
min	0.000000	0.000000	0.260000	2.090000	1.000000	1.000000	1.000000	2.090000
25%	0.000000	0.730000	0.530000	383.940000	77.000000	16.000000	4.000000	21.734700
50%	0.000000	1.640000	0.590000	691.640000	123.000000	23.000000	4.000000	31.069333
75%	1.000000	2.920000	0.670000	1121.530000	170.500000	28.000000	5.000000	46.429973
max	1.000000	400.970000	0.880000	7372.060000	910.000000	75.000000	5.000000	141.054091

	distance_from_store	gender	credit_score	total_sales	total_items	transaction_count	product_area_count	average_basket_value
47	0.189698	0	0.496855	0.220389	-0.162012	0.153916	0.940073	0.388604
749	0.051140	1	-1.368807	-1.120527	-1.443076	-1.906945	-3.303311	-1.296058
234	-0.394844	1	-1.172422	-0.100886	-0.490781	0.153916	0.940073	-0.137283

	distance_from_store	gender	credit_score	total_sales	total_items	transaction_count	product_area_count	average_basket_value
479	0.41	1	0.66	557.83	134	26	4	21.455000
356	1.06	1	0.83	888.69	146	27	3	32.914444
596	1.53	0	0.53	506.40	82	24	5	21.100000

Enhancing Target Accuracy using Machine Learning¶

Table of contents¶

Context ¶

Actions ¶

Growth/Next Steps ¶

Data Overview ¶

Data Import, EDA and Preprocessing ¶

Logistic Regression ¶

Decision Trees ¶

Random Forest ¶

K Nearest Neighbours ¶

XGB ¶

LGB ¶

Modelling Summary¶

	distance_from_store	gender	credit_score	total_sales	total_items	transaction_count	product_area_count	average_basket_value
330	0.4	1	0.46	573.11	135	25	4	22.924400
401	0.7	0	0.57	356.82	94	27	4	13.215556

	distance_from_store	gender	credit_score	total_sales	total_items	transaction_count	product_area_count	average_basket_value
64	0.8	1	0.80	950.11	186	27	5	35.189259
366	3.7	0	0.67	1230.42	170	16	3	76.901250

	distance_from_store	gender	credit_score	total_sales	total_items	transaction_count	product_area_count	average_basket_value
58	2.18	0	0.69	440.45	78	13	5	33.880769
809	1.51	1	0.55	3162.69	440	52	5	60.820962
410	0.86	0	0.75	536.86	120	32	4	16.776875

	distance_from_store	gender	credit_score	total_sales	total_items	transaction_count	product_area_count	average_basket_value
528	0.000615	1	0.590164	0.117437	0.119537	0.338235	1.00	0.310045
627	0.019053	1	0.409836	0.016127	0.033419	0.088235	0.75	0.136828
27	0.033629	0	0.737705	0.289477	0.277635	0.323529	1.00	0.831456

	input	support_results
0	distance_from_store	True
1	gender	False
2	credit_score	False
3	total_sales	True
4	total_items	False
5	transaction_count	False
6	product_area_count	True
7	average_basket_value	True

	Models	Accuracy	Precision	Recall	F1-score
0	Logistic Regression	89.57	85.00	75.56	0.80
1	Decision Tree	96.93	93.48	95.56	0.95
2	Random Forest	95.71	93.18	91.11	0.92
3	K Nearest Neighbors	81.60	69.23	60.00	0.64

Enhancing Target Accuracy using Machine Learning¶

Table of contents¶

Context ¶

Actions ¶

Growth/Next Steps ¶

Data Overview ¶

Data Import, EDA and Preprocessing ¶

Logistic Regression ¶

Decision Trees ¶

Random Forest ¶

K Nearest Neighbours ¶

XGB ¶

LGB¶

Modelling Summary¶

LGB ¶