import pandas as pd
import seaborn as srn
import statistics as sts
import matplotlib.pyplot as plt
import numpy as np
import pickle
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_curve, roc_auc_score, accuracy_score, confusion_matrix
from graphviz import Digraph
from IPython.display import Image
# Load the campaign data from a Google Sheet exported as CSV.
url = 'https://docs.google.com/spreadsheets/d/1M89WOPX-krEjzRs7zvjTxZqF9EUuxXW0vjXARdn-e4w/gviz/tq?tqx=out:csv'
dataset = pd.read_csv(url)
# NOTE: bare expressions such as the one below only render in an interactive
# session (notebook/REPL); they are no-ops when this file runs as a script.
dataset.head()
# Size
dataset.shape
# Data Type
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
dataset.info()
# Statistical Description
dataset.describe()
# Removal of Irrelevant Columns
dataset = dataset.drop(columns=['Product'])
# Rename Columns
dataset = dataset.rename(columns={'IQ': 'Quality Score'})
dataset.head()
# Removal of Symbols ($, %): keep only digits and the decimal point.
# The pattern is a raw string so that `\d` reaches the regex engine instead of
# being parsed as an (invalid) Python string escape.
dataset['CPC'] = dataset['CPC'].str.replace(r'[^\d.]', '', regex=True)
# Converting Object to Numeric
# This must happen BEFORE imputation: taking the median of a column of strings
# is not reliable (recent pandas raises TypeError), and values that cannot be
# parsed become NaN here, so they need to exist before the nulls are filled.
dataset['CPC'] = pd.to_numeric(dataset['CPC'], errors='coerce')
print(dataset['CPC'])
# Check NAN
dataset.isnull().sum()
# Replacement of Nulls with Median Values
# Assign the result back instead of calling fillna(inplace=True) on a column
# selection: that chained form is deprecated and does not update the frame
# when pandas Copy-on-Write is enabled.
mediana_clicks = dataset['Clicks'].median()
dataset['Clicks'] = dataset['Clicks'].fillna(mediana_clicks)
mediana_cpc = dataset['CPC'].median()
dataset['CPC'] = dataset['CPC'].fillna(mediana_cpc)
mediana_qs = dataset['Quality Score'].median()
dataset['Quality Score'] = dataset['Quality Score'].fillna(mediana_qs)
dataset.isnull().sum()
# Checking for duplicates
num_dup = dataset.duplicated().sum()
print(num_dup)
# info() prints its own report and returns None; no print() wrapper needed.
dataset.info()
# Derived campaign metrics, computed row by row from the cleaned columns.
# NOTE(review): nothing here guards against a zero in Impressions, Conversions,
# Clicks or Spend; such rows produce inf/NaN in the ratios below. Confirm the
# data cannot contain zeros, or handle them before plotting/modelling.
dataset['Spend'] = dataset['CPC'] * dataset['Clicks']
# Clicks per impression, as a percentage.
dataset['CTR'] = (dataset['Clicks'] / dataset['Impressions']) * 100
# Spend per conversion.
dataset['CPA'] = dataset['Spend'] / dataset['Conversions']
# Conversions per click, as a percentage.
dataset['CVR'] = (dataset['Conversions'] / dataset['Clicks']) * 100
dataset['Revenue'] = dataset['Product Price'] * dataset['Conversions']
# Revenue per unit of spend.
dataset['ROAS'] = dataset['Revenue'] / dataset['Spend']
# Binary flag: 1 when ROAS is at least 3, otherwise 0. The comparison is
# vectorized; NaN compares False and therefore maps to 0, exactly as the
# previous per-row 'Yes'/'No' lambda followed by a mapping did.
dataset['ROAS >= 3'] = (dataset['ROAS'] >= 3).astype('int64')
# info() prints its own report and returns None; no print() wrapper needed.
dataset.info()
dataset.head()
# Statistical Description
dataset.describe()
# Univariate distributions: for every numeric column draw a boxplot (left) and
# a 5-bin histogram with a KDE overlay (right) on one figure.
# Each entry is (column name, plot colour); None keeps seaborn's default colour.
distribution_columns = [
    ('Clicks', None),
    ('Impressions', 'coral'),
    ('CPC', 'purple'),
    ('Conversions', 'green'),
    ('Product Price', 'pink'),
    ('Quality Score', 'gray'),
    ('Spend', 'blue'),
    ('CTR', 'coral'),
    ('CPA', 'purple'),
    ('CVR', 'green'),
    ('Revenue', 'gray'),
    ('ROAS', 'pink'),
]
for column, color in distribution_columns:
    fig, ax = plt.subplots(1, 2, figsize=(12, 6))
    srn.boxplot(x=dataset[column], ax=ax[0], color=color)
    ax[0].set_title('Boxplot')
    srn.histplot(dataset[column], bins=5, kde=True, ax=ax[1], color=color)
    ax[1].set_title('Histogram')
    plt.tight_layout()
    plt.show()
# Outlier bounds for CPC using the 1.5 x IQR rule around the quartiles.
Q1 = dataset['CPC'].quantile(0.25)
Q3 = dataset['CPC'].quantile(0.75)
IQR = Q3 - Q1
limite_inferior = Q1 - 1.5 * IQR
limite_superior = Q3 + 1.5 * IQR
# Rows whose CPC falls outside the bounds (shown for inspection only).
out_cpc = dataset[(dataset['CPC'] < limite_inferior) | (dataset['CPC'] > limite_superior)]
print(out_cpc)
# Removing Outliers
# between() is inclusive on both ends, i.e. the same >= / <= test as before;
# rows with a NaN CPC fail the test and are dropped as well.
# .copy() makes the result an independent frame so that adding columns to it
# later does not raise SettingWithCopyWarning.
dataset = dataset[dataset['CPC'].between(limite_inferior, limite_superior)].copy()
dataset.shape
# Boxplot of CPC after the outliers are removed.
srn.boxplot(x=dataset['CPC'])
# Finish this figure here; otherwise, when run as a script, the next plot is
# drawn on top of the same axes.
plt.show()
# Scatterplot Clicks vs Impressions
srn.scatterplot(data=dataset, x='Clicks', y='Impressions')
plt.title('Clicks vs Impressions', fontsize=16)
plt.show()
# Scatterplot Clicks vs Conversions
srn.scatterplot(data=dataset, x='Clicks', y='Conversions', color='green')
plt.title('Clicks vs Conversions', fontsize=16)
plt.show()
# Scatterplot CPC vs Conversions
srn.scatterplot(data=dataset, x='CPC', y='Conversions', color='red')
plt.title('CPC vs Conversions', fontsize=16)
plt.show()
# Each line graph below gets its own title and plt.show(), matching the
# scatterplots above. Without them, running this file as a script draws all
# four lines onto a single shared axes.
# Line Graph CPC vs Clicks
srn.lineplot(data=dataset, x='CPC', y='Clicks', marker='o', color='blue')
plt.title('CPC vs Clicks', fontsize=16)
plt.show()
# Line Graph CPC vs Conversions
srn.lineplot(data=dataset, x='CPC', y='Conversions', marker='o', color='violet')
plt.title('CPC vs Conversions', fontsize=16)
plt.show()
# Line Graph Clicks vs Impressions
srn.lineplot(data=dataset, x='Clicks', y='Impressions', marker='o', color='green')
plt.title('Clicks vs Impressions', fontsize=16)
plt.show()
# Line Graph Clicks vs Conversions
srn.lineplot(data=dataset, x='Clicks', y='Conversions', marker='o', color='brown')
plt.title('Clicks vs Conversions', fontsize=16)
plt.show()
# Define CPC Interval
# pd.cut uses right-closed bins by default, i.e. (0, 0.3], (0.3, 0.6], ...;
# CPC values outside (0, 1.5] get NaN and are left out of the group means.
cpc_bins = [0, 0.3, 0.6, 0.9, 1.2, 1.5]
# assign() returns a new frame, so this is safe even when `dataset` is a
# filtered selection (no SettingWithCopyWarning). The binned column is built
# once and shared by both charts below.
dataset = dataset.assign(CPCbin=pd.cut(dataset['CPC'], bins=cpc_bins))
# Average of Clicks per CPC Interval
# observed=False keeps empty intervals in the result.
mean_clicks = dataset.groupby('CPCbin', observed=False)['Clicks'].mean().sort_values()
# Start a fresh figure so the bars are not drawn onto a leftover axes.
plt.figure()
mean_clicks.plot(kind='barh', color='purple')
plt.title('Average of Clicks per CPC Interval', fontsize=16)
plt.xlabel('Average of Clicks')
# The y axis lists the CPC intervals themselves, not an average of CPC.
plt.ylabel('CPC Interval')
plt.show()
# Average of Conversions per CPC Interval
mean_conversions = dataset.groupby('CPCbin', observed=False)['Conversions'].mean().sort_values()
plt.figure()
mean_conversions.plot(kind='barh', color='green')
plt.title('Average of Conversions per CPC Interval', fontsize=16)
plt.xlabel('Average of Conversions')
plt.ylabel('CPC Interval')
plt.show()