Load Libraries

In [2]:
import pandas as pd
import seaborn as srn
import statistics  as sts
import matplotlib.pyplot as plt
import numpy as np
import pickle
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_curve, roc_auc_score, accuracy_score, confusion_matrix
from graphviz import Digraph
from IPython.display import Image

Dataset Upload

In [3]:
url = 'https://docs.google.com/spreadsheets/d/1M89WOPX-krEjzRs7zvjTxZqF9EUuxXW0vjXARdn-e4w/gviz/tq?tqx=out:csv'
In [4]:
dataset = pd.read_csv(url)
In [5]:
dataset.head()
Out[5]:
Product Clicks Impressions CPC Conversions Product Price IQ
0 Product 1 348.0 4045 $1.50 11 49 6.0
1 Product 2 271.0 2205 $1.40 16 60 7.0
2 Product 3 767.0 10285 $1.12 40 67 7.0
3 Product 4 399.0 5555 $1.12 19 56 6.0
4 Product 5 537.0 7777 $1.12 28 61 7.0

Data Exploration

In [6]:
# Size
dataset.shape
Out[6]:
(500, 7)
In [7]:
# Data Type
print(dataset.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Product        500 non-null    object 
 1   Clicks         499 non-null    float64
 2   Impressions    500 non-null    int64  
 3   CPC            499 non-null    object 
 4   Conversions    500 non-null    int64  
 5   Product Price  500 non-null    int64  
 6   IQ             499 non-null    float64
dtypes: float64(2), int64(3), object(2)
memory usage: 27.5+ KB
None
In [8]:
# Statistical Description
dataset.describe()
Out[8]:
Clicks Impressions Conversions Product Price IQ
count 499.000000 500.000000 500.000000 500.000000 499.000000
mean 589.042084 7073.748000 28.224000 43.444000 7.254509
std 233.816528 3655.743075 11.949368 15.074925 1.260042
min 202.000000 2004.000000 8.000000 11.000000 5.000000
25% 376.500000 4254.750000 18.000000 30.000000 7.000000
50% 583.000000 6358.000000 28.000000 43.000000 7.000000
75% 780.000000 8835.000000 36.250000 56.000000 8.000000
max 1000.000000 18411.000000 58.000000 70.000000 10.000000

Data Cleaning

In [9]:
# Removal of Irrelevant Columns
dataset = dataset.drop(['Product'], axis=1)
In [10]:
# Rename Columns
dataset = dataset.rename(columns={'IQ': 'Quality Score'})
In [11]:
dataset.head()
Out[11]:
Clicks Impressions CPC Conversions Product Price Quality Score
0 348.0 4045 $1.50 11 49 6.0
1 271.0 2205 $1.40 16 60 7.0
2 767.0 10285 $1.12 40 67 7.0
3 399.0 5555 $1.12 19 56 6.0
4 537.0 7777 $1.12 28 61 7.0
In [12]:
# Removal of Symbols ($, %)
dataset['CPC'] = dataset['CPC'].str.replace('[^\d.]', '', regex=True)
print(dataset['CPC'])
0      1.50
1      1.40
2      1.12
3      1.12
4      1.12
       ... 
495    1.13
496    1.13
497    1.12
498    1.12
499    1.12
Name: CPC, Length: 500, dtype: object
In [13]:
# Check NAN
dataset.isnull().sum()
Out[13]:
0
Clicks 1
Impressions 0
CPC 1
Conversions 0
Product Price 0
Quality Score 1

In [14]:
# Replacement of Nulls with Median Values
mediana_clicks = dataset['Clicks'].median()
dataset['Clicks'].fillna(mediana_clicks, inplace=True)
mediana_cpc = dataset['CPC'].median()
dataset['CPC'].fillna(mediana_cpc, inplace=True)
mediana_qs = dataset['Quality Score'].median()
dataset['Quality Score'].fillna(mediana_qs, inplace=True)
In [15]:
dataset.isnull().sum()
Out[15]:
0
Clicks 0
Impressions 0
CPC 0
Conversions 0
Product Price 0
Quality Score 0

In [16]:
# Checking for duplicates
num_dup = dataset.duplicated().sum()
print(num_dup)
0

Data Transformation

In [17]:
print(dataset.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Clicks         500 non-null    float64
 1   Impressions    500 non-null    int64  
 2   CPC            500 non-null    object 
 3   Conversions    500 non-null    int64  
 4   Product Price  500 non-null    int64  
 5   Quality Score  500 non-null    float64
dtypes: float64(2), int64(3), object(1)
memory usage: 23.6+ KB
None
In [18]:
# Converting Object to Numeric
dataset['CPC'] = pd.to_numeric(dataset['CPC'], errors='coerce')
print(dataset['CPC'])
0      1.50
1      1.40
2      1.12
3      1.12
4      1.12
       ... 
495    1.13
496    1.13
497    1.12
498    1.12
499    1.12
Name: CPC, Length: 500, dtype: float64
In [19]:
print(dataset.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Clicks         500 non-null    float64
 1   Impressions    500 non-null    int64  
 2   CPC            500 non-null    float64
 3   Conversions    500 non-null    int64  
 4   Product Price  500 non-null    int64  
 5   Quality Score  500 non-null    float64
dtypes: float64(3), int64(3)
memory usage: 23.6 KB
None

Feature Engineering

In [20]:
dataset['Spend'] = dataset['CPC'] * dataset['Clicks']
In [21]:
dataset['CTR'] = (dataset['Clicks'] / dataset['Impressions']) * 100
In [22]:
dataset['CPA'] = dataset['Spend'] / dataset['Conversions']
In [23]:
dataset['CVR'] = (dataset['Conversions'] / dataset['Clicks']) * 100
In [24]:
dataset['Revenue'] = dataset['Product Price'] * dataset['Conversions']
In [25]:
dataset['ROAS'] = dataset['Revenue'] / dataset['Spend']
In [26]:
dataset['ROAS >= 3'] = dataset['ROAS'].apply(lambda x: 'Yes' if x >= 3 else 'No')
In [27]:
dataset['ROAS >= 3'] = dataset['ROAS >= 3'].map({'Yes': 1, 'No': 0})
In [28]:
print(dataset.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Clicks         500 non-null    float64
 1   Impressions    500 non-null    int64  
 2   CPC            500 non-null    float64
 3   Conversions    500 non-null    int64  
 4   Product Price  500 non-null    int64  
 5   Quality Score  500 non-null    float64
 6   Spend          500 non-null    float64
 7   CTR            500 non-null    float64
 8   CPA            500 non-null    float64
 9   CVR            500 non-null    float64
 10  Revenue        500 non-null    int64  
 11  ROAS           500 non-null    float64
 12  ROAS >= 3      500 non-null    int64  
dtypes: float64(8), int64(5)
memory usage: 50.9 KB
None
In [29]:
dataset.head()
Out[29]:
Clicks Impressions CPC Conversions Product Price Quality Score Spend CTR CPA CVR Revenue ROAS ROAS >= 3
0 348.0 4045 1.50 11 49 6.0 522.00 8.603214 47.454545 3.160920 539 1.032567 0
1 271.0 2205 1.40 16 60 7.0 379.40 12.290249 23.712500 5.904059 960 2.530311 0
2 767.0 10285 1.12 40 67 7.0 859.04 7.457462 21.476000 5.215124 2680 3.119762 1
3 399.0 5555 1.12 19 56 6.0 446.88 7.182718 23.520000 4.761905 1064 2.380952 0
4 537.0 7777 1.12 28 61 7.0 601.44 6.904976 21.480000 5.214153 1708 2.839851 0

Initial Data Visualization

In [30]:
# Statistical Description
dataset.describe()
Out[30]:
Clicks Impressions CPC Conversions Product Price Quality Score Spend CTR CPA CVR Revenue ROAS ROAS >= 3
count 500.000000 500.000000 500.00000 500.000000 500.000000 500.00000 500.00000 500.000000 500.000000 500.000000 500.000000 500.000000 500.000000
mean 589.030000 7073.748000 1.35888 28.224000 43.444000 7.25400 879.54872 8.967675 29.489859 4.849527 1226.268000 2.663410 0.332000
std 233.582282 3655.743075 11.14481 11.949368 15.074925 1.25883 8344.36570 2.392384 245.252412 0.893427 694.493853 1.346705 0.471403
min 202.000000 2004.000000 0.44000 8.000000 11.000000 5.00000 112.50000 5.007481 8.184000 1.372213 165.000000 0.010182 0.000000
25% 376.750000 4254.750000 0.64000 18.000000 30.000000 7.00000 306.47750 7.301359 13.450161 4.320988 680.000000 1.641455 0.000000
50% 583.000000 6358.000000 0.82000 28.000000 43.000000 7.00000 448.60000 8.859212 17.278365 4.920574 1060.000000 2.438319 0.000000
75% 779.500000 8835.000000 1.10000 36.250000 56.000000 8.00000 648.27000 10.610200 21.940672 5.645761 1651.500000 3.380734 1.000000
max 1000.000000 18411.000000 250.00000 58.000000 70.000000 10.00000 187000.00000 14.956737 5500.000000 6.052270 3710.000000 7.445590 1.000000
In [31]:
fig, ax = plt.subplots(1, 2, figsize=(12, 6))
srn.boxplot(x=dataset['Clicks'], ax=ax[0])
ax[0].set_title('Boxplot')
srn.histplot(dataset['Clicks'], bins=5, kde=True, ax=ax[1])
ax[1].set_title('Histogram')
plt.tight_layout()
plt.show()
In [32]:
fig, ax = plt.subplots(1, 2, figsize=(12, 6))
srn.boxplot(x=dataset['Impressions'], ax=ax[0],color='coral')
ax[0].set_title('Boxplot')
srn.histplot(dataset['Impressions'], bins=5, kde=True, ax=ax[1],color='coral')
ax[1].set_title('Histogram')
plt.tight_layout()
plt.show()
In [33]:
fig, ax = plt.subplots(1, 2, figsize=(12, 6))
srn.boxplot(x=dataset['CPC'], ax=ax[0],color='purple')
ax[0].set_title('Boxplot')
srn.histplot(dataset['CPC'], bins=5, kde=True, ax=ax[1],color='purple')
ax[1].set_title('Histogram')
plt.tight_layout()
plt.show()
In [34]:
fig, ax = plt.subplots(1, 2, figsize=(12, 6))
srn.boxplot(x=dataset['Conversions'], ax=ax[0],color='green')
ax[0].set_title('Boxplot')
srn.histplot(dataset['Conversions'], bins=5, kde=True, ax=ax[1],color='green')
ax[1].set_title('Histogram')
plt.tight_layout()
plt.show()
In [35]:
fig, ax = plt.subplots(1, 2, figsize=(12, 6))
srn.boxplot(x=dataset['Product Price'], ax=ax[0],color='pink')
ax[0].set_title('Boxplot')
srn.histplot(dataset['Product Price'], bins=5, kde=True, ax=ax[1],color='pink')
ax[1].set_title('Histogram')
plt.tight_layout()
plt.show()
In [36]:
fig, ax = plt.subplots(1, 2, figsize=(12, 6))
srn.boxplot(x=dataset['Quality Score'], ax=ax[0],color='gray')
ax[0].set_title('Boxplot')
srn.histplot(dataset['Quality Score'], bins=5, kde=True, ax=ax[1],color='gray')
ax[1].set_title('Histogram')
plt.tight_layout()
plt.show()
In [37]:
fig, ax = plt.subplots(1, 2, figsize=(12, 6))
srn.boxplot(x=dataset['Spend'], ax=ax[0],color='blue')
ax[0].set_title('Boxplot')
srn.histplot(dataset['Spend'], bins=5, kde=True, ax=ax[1],color='blue')
ax[1].set_title('Histogram')
plt.tight_layout()
plt.show()
In [38]:
fig, ax = plt.subplots(1, 2, figsize=(12, 6))
srn.boxplot(x=dataset['CTR'], ax=ax[0],color='coral')
ax[0].set_title('Boxplot')
srn.histplot(dataset['CTR'], bins=5, kde=True, ax=ax[1],color='coral')
ax[1].set_title('Histogram')
plt.tight_layout()
plt.show()
In [39]:
fig, ax = plt.subplots(1, 2, figsize=(12, 6))
srn.boxplot(x=dataset['CPA'], ax=ax[0],color='purple')
ax[0].set_title('Boxplot')
srn.histplot(dataset['CPA'], bins=5, kde=True, ax=ax[1],color='purple')
ax[1].set_title('Histogram')
plt.tight_layout()
plt.show()
In [40]:
fig, ax = plt.subplots(1, 2, figsize=(12, 6))
srn.boxplot(x=dataset['CVR'], ax=ax[0],color='green')
ax[0].set_title('Boxplot')
srn.histplot(dataset['CVR'], bins=5, kde=True, ax=ax[1],color='green')
ax[1].set_title('Histogram')
plt.tight_layout()
plt.show()
In [41]:
fig, ax = plt.subplots(1, 2, figsize=(12, 6))
srn.boxplot(x=dataset['Revenue'], ax=ax[0],color='gray')
ax[0].set_title('Boxplot')
srn.histplot(dataset['Revenue'], bins=5, kde=True, ax=ax[1],color='gray')
ax[1].set_title('Histogram')
plt.tight_layout()
plt.show()
In [42]:
fig, ax = plt.subplots(1, 2, figsize=(12, 6))
srn.boxplot(x=dataset['ROAS'], ax=ax[0],color='pink')
ax[0].set_title('Boxplot')
srn.histplot(dataset['ROAS'], bins=5, kde=True, ax=ax[1],color='pink')
ax[1].set_title('Histogram')
plt.tight_layout()
plt.show()

Data Treatment

In [43]:
Q1 = dataset['CPC'].quantile(0.25)
Q3 = dataset['CPC'].quantile(0.75)
IQR = Q3 - Q1
limite_inferior = Q1 - 1.5 * IQR
limite_superior = Q3 + 1.5 * IQR
out_cpc = dataset[(dataset['CPC'] < limite_inferior) | (dataset['CPC'] > limite_superior)]
print(out_cpc)
     Clicks  Impressions    CPC  Conversions  Product Price  Quality Score  \
266   748.0        10241  250.0           34             56            6.0   

        Spend       CTR     CPA       CVR  Revenue      ROAS  ROAS >= 3  
266  187000.0  7.303974  5500.0  4.545455     1904  0.010182          0  
In [44]:
# Removing Outliers
dataset = dataset[(dataset['CPC'] >= limite_inferior) & (dataset['CPC'] <= limite_superior)]
dataset.shape
Out[44]:
(499, 13)
In [45]:
srn.boxplot(x=dataset['CPC'])
Out[45]:
<Axes: xlabel='CPC'>

Data Visualization

In [46]:
# Scatterplot Clicks vs Impressions
srn.scatterplot(data=dataset, x='Clicks', y='Impressions')
plt.title('Clicks vs Impressions',fontsize=16)
plt.show()
In [47]:
# Scatterplot Clicks vs Conversions
srn.scatterplot(data=dataset, x='Clicks', y='Conversions', color='green')
plt.title('Clicks vs Conversions',fontsize=16)
plt.show()
In [48]:
# Scatterplot CPC vs Conversions
srn.scatterplot(data=dataset, x='CPC', y='Conversions', color='red')
plt.title('CPC vs Conversions',fontsize=16)
plt.show()
In [49]:
# Line Graph CPC vs Clicks
srn.lineplot(data=dataset, x='CPC', y='Clicks', marker='o', color='blue')
Out[49]:
<Axes: xlabel='CPC', ylabel='Clicks'>
In [50]:
# Line Graph CPC vs Conversions
srn.lineplot(data=dataset, x='CPC', y='Conversions', marker='o', color='violet')
Out[50]:
<Axes: xlabel='CPC', ylabel='Conversions'>
In [51]:
# Line Graph Clicks vs Impressions
srn.lineplot(data=dataset, x='Clicks', y='Impressions', marker='o', color='green')
Out[51]:
<Axes: xlabel='Clicks', ylabel='Impressions'>
In [52]:
# Line Graph Clicks vs Conversions
srn.lineplot(data=dataset, x='Clicks', y='Conversions', marker='o', color='brown')
Out[52]:
<Axes: xlabel='Clicks', ylabel='Conversions'>
In [53]:
# Define CPC Interval
cpc_bins = [0, 0.3, 0.6, 0.9, 1.2, 1.5]
dataset['CPCbin'] = pd.cut(dataset['CPC'], bins=cpc_bins)

# Average of Clicks per CPC Interval
mean_clicks = dataset.groupby('CPCbin', observed=False)['Clicks'].mean().sort_values()
mean_clicks.plot(kind='barh', color='purple')
plt.title('Average of Clicks per CPC Interval',fontsize=16)
plt.xlabel('Average of Clicks')
plt.ylabel('Average of CPC')
plt.show()
In [54]:
# Average of Conversions per CPC Interval
cpc_bins = [0, 0.3, 0.6, 0.9, 1.2, 1.5]
dataset['CPCbin'] = pd.cut(dataset['CPC'], bins=cpc_bins)

# Média de cliques por intervalo de CPC com observed=False
mean_conversions = dataset.groupby('CPCbin', observed=False)['Conversions'].mean().sort_values()
mean_conversions.plot(kind='barh',color='green')
plt.title('Average of Conversions per CPC Interval',fontsize=16)
plt.xlabel('Average of Conversions')
plt.ylabel('Average of CPC')
plt.show()