import pandas as pd
import seaborn as srn
import statistics as sts
import matplotlib.pyplot as plt
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_curve, roc_auc_score, accuracy_score, confusion_matrix
import pickle
from graphviz import Digraph
from IPython.display import Image
url = 'https://docs.google.com/spreadsheets/d/1M89WOPX-krEjzRs7zvjTxZqF9EUuxXW0vjXARdn-e4w/gviz/tq?tqx=out:csv'
dataset = pd.read_csv(url)
dataset.head()
# Size
dataset.shape
# Data Type
print(dataset.info())
# Statistical Description
dataset.describe()
# Removal of Irrelevant Columns
dataset = dataset.drop(['Product'], axis=1)
# Rename Columns
dataset = dataset.rename(columns={'IQ': 'Quality Score'})
dataset.head()
# Removal of Symbols ($, %)
dataset['CPC'] = dataset['CPC'].str.replace(r'[^\d.]', '', regex=True)
print(dataset['CPC'])
# Check NAN
dataset.isnull().sum()
# Replacement of Nulls with Median Values
mediana_clicks = dataset['Clicks'].median()
dataset['Clicks'] = dataset['Clicks'].fillna(mediana_clicks)
# CPC is still stored as text at this point, so convert it to numeric before taking the median
dataset['CPC'] = pd.to_numeric(dataset['CPC'], errors='coerce')
mediana_cpc = dataset['CPC'].median()
dataset['CPC'] = dataset['CPC'].fillna(mediana_cpc)
mediana_qs = dataset['Quality Score'].median()
dataset['Quality Score'] = dataset['Quality Score'].fillna(mediana_qs)
dataset.isnull().sum()
# Checking for duplicates
num_dup = dataset.duplicated().sum()
print(num_dup)
print(dataset.info())
# Confirming CPC is Numeric (already converted above, so this re-conversion is a harmless no-op)
dataset['CPC'] = pd.to_numeric(dataset['CPC'], errors='coerce')
print(dataset['CPC'])
print(dataset.info())
# Derived metrics (note: rows with zero Conversions or zero Spend produce inf/NaN in CPA and ROAS)
dataset['Spend'] = dataset['CPC'] * dataset['Clicks']
dataset['CTR'] = (dataset['Clicks'] / dataset['Impressions']) * 100
dataset['CPA'] = dataset['Spend'] / dataset['Conversions']
dataset['CVR'] = (dataset['Conversions'] / dataset['Clicks']) * 100
dataset['Revenue'] = dataset['Product Price'] * dataset['Conversions']
dataset['ROAS'] = dataset['Revenue'] / dataset['Spend']
# Binary target: 1 if ROAS is at least 3, otherwise 0
dataset['ROAS >= 3'] = (dataset['ROAS'] >= 3).astype(int)
print(dataset.info())
dataset.head()
# Statistical Description
dataset.describe()
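# A correlation heatmap can complement describe() for a quick look at how the metrics
# relate; a minimal sketch using the seaborn alias imported above (the numeric_only
# argument of DataFrame.corr assumes pandas >= 1.5):
plt.figure(figsize=(10, 8))
srn.heatmap(dataset.corr(numeric_only=True), annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Correlation Matrix')
plt.tight_layout()
plt.show()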
# Boxplot and histogram for each numeric variable (same plots as before, generated in a loop)
plot_cols = [
    ('Clicks', None), ('Impressions', 'coral'), ('CPC', 'purple'),
    ('Conversions', 'green'), ('Product Price', 'pink'), ('Quality Score', 'gray'),
    ('Spend', 'blue'), ('CTR', 'coral'), ('CPA', 'purple'),
    ('CVR', 'green'), ('Revenue', 'gray'), ('ROAS', 'pink'),
]
for col, color in plot_cols:
    fig, ax = plt.subplots(1, 2, figsize=(12, 6))
    srn.boxplot(x=dataset[col], ax=ax[0], color=color)
    ax[0].set_title(f'{col} - Boxplot')
    srn.histplot(dataset[col], bins=5, kde=True, ax=ax[1], color=color)
    ax[1].set_title(f'{col} - Histogram')
    plt.tight_layout()
    plt.show()
Q1 = dataset['CPC'].quantile(0.25)
Q3 = dataset['CPC'].quantile(0.75)
IQR = Q3 - Q1
limite_inferior = Q1 - 1.5 * IQR
limite_superior = Q3 + 1.5 * IQR
out_cpc = dataset[(dataset['CPC'] < limite_inferior) | (dataset['CPC'] > limite_superior)]
print(out_cpc)
# Removing Outliers
dataset = dataset[(dataset['CPC'] >= limite_inferior) & (dataset['CPC'] <= limite_superior)]
dataset.shape
srn.boxplot(x=dataset['CPC'])
plt.show()
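# The same IQR rule could be reused for other numeric columns; a minimal, illustrative
# helper (not applied to the dataset here), assuming the usual 1.5 * IQR whisker definition:
def iqr_mask(series, k=1.5):
    """Return a boolean mask that is True for values inside the IQR whiskers."""
    q1, q3 = series.quantile(0.25), series.quantile(0.75)
    iqr = q3 - q1
    return series.between(q1 - k * iqr, q3 + k * iqr)
# Example: dataset[iqr_mask(dataset['Spend'])] would filter Spend outliers the same way.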
class_distribution = dataset['ROAS >= 3'].value_counts()
print(class_distribution)
X = dataset[['CPC', 'Product Price', 'Quality Score', 'Spend', 'CTR', 'CVR']]
y = dataset['ROAS >= 3']
# Stratify on the target so the train/test split preserves the class imbalance
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# Normalization
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Class balancing
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)
print("Class distribution in the training set after SMOTE:")
print(pd.Series(y_train_smote).value_counts())
log_reg = LogisticRegression()
log_reg.fit(X_train_smote, y_train_smote)
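# The scale -> SMOTE -> fit sequence above can also be expressed as a single imblearn
# pipeline; a sketch of an equivalent setup (not used for the evaluation below), assuming
# imblearn.pipeline is available alongside the SMOTE import:
from imblearn.pipeline import Pipeline as ImbPipeline
pipeline_sketch = ImbPipeline([
    ('scale', StandardScaler()),
    ('smote', SMOTE(random_state=42)),
    ('clf', LogisticRegression()),
])
pipeline_sketch.fit(X_train, y_train)  # resampling is applied only during fit, not at predict time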
y_pred = log_reg.predict(X_test_scaled)
y_pred_prob = log_reg.predict_proba(X_test_scaled)[:, 1]
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_prob)
class_report = classification_report(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")
print("\n--- Confusion Matrix ---")
print("Confusion Matrix:\n", conf_matrix)
print("True Negatives (TN):", conf_matrix[0, 0])
print("False Positives (FP):", conf_matrix[0, 1])
print("False Negatives (FN):", conf_matrix[1, 0])
print("True Positives (TP):", conf_matrix[1, 1])
print(f"ROC AUC Score: {roc_auc:.2f}")
print("\n--- Classification Report ---")
print(class_report)
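# Optional visual for the confusion matrix above; a small sketch using the seaborn
# heatmap already available through the srn alias (tick labels are illustrative):
plt.figure(figsize=(5, 4))
srn.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Pred: No', 'Pred: Yes'], yticklabels=['Actual: No', 'Actual: Yes'])
plt.title('Confusion Matrix')
plt.tight_layout()
plt.show()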
fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, color='blue', label=f'ROC Curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='red', linestyle='--') # Diagonal dashed line (random classifier)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate (FPR)')
plt.ylabel('True Positive Rate (TPR)')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.grid(True, linestyle='--', alpha=0.6)
plt.show()
# Check the range of predicted probabilities
min_prob = y_pred_prob.min()
max_prob = y_pred_prob.max()
print("Range of predicted probabilities:")
print(f"Min: {min_prob:.10f}, Max: {max_prob:.10f}")
print("\nInterpretation:")
if min_prob < 0.01 and max_prob > 0.99:
    print("The predicted probabilities span from near 0 to near 1, which indicates that the model is generating a wide range of confidence levels.")
    print("This suggests that the model is effectively distinguishing between classes and should produce a meaningful ROC curve.")
else:
    print("The range of predicted probabilities is not as wide as expected.")
    print("This might indicate that the model is not distinguishing well between classes or may need further adjustment.")
# Evaluate model on training and test sets
# (training metrics are computed on the SMOTE-resampled training data, so they are not directly comparable to the test metrics)
y_train_pred = log_reg.predict(X_train_smote)
y_train_pred_prob = log_reg.predict_proba(X_train_smote)[:, 1]
print("Training Accuracy:", accuracy_score(y_train_smote, y_train_pred))
print("Training ROC AUC Score:", roc_auc_score(y_train_smote, y_train_pred_prob))
print("\nTest Accuracy:", accuracy_score(y_test, y_pred))
print("Test ROC AUC Score:", roc_auc_score(y_test, y_pred_prob))
print("\n--- Additional Evaluation Metrics Explained ---")
print("1. **ROC AUC Score**: A measure of how well the model distinguishes between classes. Closer to 1 is better.")
# Create the flowchart
dot = Digraph()
# Add the blocks with different colors
dot.node('A', 'Data Input\n(CPC, Product Price, Quality Score, Spend, CTR, CVR)', shape='ellipse', style='filled', fillcolor='lightblue')
dot.node('B', 'Processing\n(Logistic Regression Model)', shape='ellipse', style='filled', fillcolor='lightgreen')
dot.node('C', 'Final Prediction\n(ROAS >= 3 or ROAS < 3)', shape='ellipse', style='filled', fillcolor='lightcoral')
# Add the arrows between blocks
dot.edge('A', 'B')
dot.edge('B', 'C')
# Render the diagram
dot.render('flow_diagram', format='png', cleanup=True)
# Display the diagram
Image('flow_diagram.png')
labels = ['ROAS >= 3', 'ROAS < 3']
real_counts = [sum(y_test == 1), sum(y_test == 0)] # Actual
predicted_counts = [sum(y_pred == 1), sum(y_pred == 0)] # Predicted
x = np.arange(len(labels))
plt.figure(figsize=(8, 6))
plt.bar(x - 0.2, real_counts, width=0.4, label='Actual',color='#4F8CFF')
plt.bar(x + 0.2, predicted_counts, width=0.4, label='Predicted', color='salmon')
plt.xticks(x, labels)
plt.xlabel('Categories')
plt.ylabel('Quantity')
plt.title('Actual vs. Predicted Comparison')
plt.legend()
plt.tight_layout()
plt.show()
# Data for the bar chart
labels = ['Correct Predictions', 'Incorrect Predictions']
values = [accuracy, 1 - accuracy]
# Creating the bar chart
plt.figure(figsize=(8, 5))
plt.bar(labels, values, color=['#4F8CFF', '#FF6F61'])
plt.ylim(0, 1)
# Adding titles and labels
plt.title('Model Accuracy')
plt.ylabel('Proportion')
plt.xticks(rotation=45)
plt.text(0, accuracy + 0.02, f'{accuracy * 100:.1f}%', ha='center', color='black')
plt.text(1, (1 - accuracy) + 0.02, f'{(1 - accuracy) * 100:.1f}%', ha='center', color='black')
plt.show()
# Data for the pie chart
sizes = [accuracy, 1 - accuracy]
labels = ['Correct Predictions', 'Incorrect Predictions']
colors = ['#4F8CFF', '#FF6F61']
explode = (0.1, 0) # explode the 1st slice (Correct Predictions)
# Creating the pie chart
plt.figure(figsize=(8, 5))
plt.pie(sizes, explode=explode, labels=labels, colors=colors, autopct='%1.1f%%', shadow=True, startangle=90)
plt.axis('equal') # Equal aspect ratio ensures that pie is drawn as a circle.
plt.title('Model Accuracy')
plt.show()
data_input = pd.DataFrame({
'CPC': [0.75],
'Product Price': [40],
'Quality Score': [8],
'Spend': [337.5],
'CTR': [9.0],
'CVR': [4.0]
})
data_input_scaled = scaler.transform(data_input)
predictions = log_reg.predict(data_input_scaled)
predicted_probabilities = log_reg.predict_proba(data_input_scaled)[:, 1]
predictions_labels = ['Yes' if prob > 0.5 else 'No' for prob in predicted_probabilities]
results = data_input.copy()
results['Predicted'] = predictions_labels
results['Probability'] = predicted_probabilities
print("\nPrediction Results For input data:")
for i, row in results.iterrows():
    print(f"\nCPC: {row['CPC']}, Product Price: {row['Product Price']}, Quality Score: {row['Quality Score']}, Spend: {row['Spend']}, CTR: {row['CTR']}, CVR: {row['CVR']}")
    print(f"Predicted class: {row['Predicted']}")
    print("Test Accuracy:", accuracy_score(y_test, y_pred))
    print(f"Predicted probability of 'ROAS 3 or Higher': {row['Probability']:.4f}")
    if row['Probability'] > 0.5:
        print("Interpretation: The model is confident that this data point falls into 'ROAS 3 or Higher'.")
    else:
        print("Interpretation: The model is confident that this data point falls into 'ROAS Less than 3'.")
cpc = float(input("Enter CPC: "))
product_price = float(input("Enter Product Price: "))
iq = float(input("Enter Quality Score: "))
spend = float(input("Enter Spend: "))
ctr = float(input("Enter CTR: "))
cvr = float(input("Enter CVR: "))
user_input = pd.DataFrame([[cpc, product_price, iq, spend, ctr, cvr]], columns=['CPC', 'Product Price', 'Quality Score', 'Spend', 'CTR', 'CVR'])
user_input_scaled = scaler.transform(user_input)
prob = log_reg.predict_proba(user_input_scaled)[0, 1]
pred_class = 'ROAS 3 or Higher' if prob > 0.5 else 'Less than ROAS 3'
results = user_input.copy()
results['Predicted'] = pred_class
results['Probability'] = prob
print("\nPrediction Results For User data:")
for i, row in results.iterrows():
    print(f"\nCPC: {row['CPC']}, Product Price: {row['Product Price']}, Quality Score: {row['Quality Score']}, Spend: {row['Spend']}, CTR: {row['CTR']}, CVR: {row['CVR']}")
    print(f"Predicted class: {row['Predicted']}")
    print("Test Accuracy:", accuracy_score(y_test, y_pred))
    print(f"Predicted probability of class 'Yes': {row['Probability']:.4f}")
    if row['Probability'] > 0.5:
        print("Interpretation: The model is confident that this data point falls into 'ROAS 3 or Higher'.")
    else:
        print("Interpretation: The model is confident that this data point falls into 'ROAS Less than 3'.")