import statistics as sts

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as srn
from graphviz import Digraph
from IPython.display import Image
from IPython.display import display  # required: the script calls display(dot) at the end
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Load the ad-campaign dataset straight from a public Google Sheet (CSV export).
url = 'https://docs.google.com/spreadsheets/d/1jd3c7CpUC0pgSxLVYXSFncqVEea3hLOivc3MlNq-axo/gviz/tq?tqx=out:csv'
dataset = pd.read_csv(url)
dataset.head()
# Shape: (rows, columns)
dataset.shape
# Column dtypes and non-null counts
print(dataset.info())
# Summary statistics for the numeric columns
dataset.describe()
# Drop columns not used for modelling
dataset = dataset.drop(columns=['Campaign Name', 'CVR', 'CTR'])
dataset.head()
# --- Cleaning -----------------------------------------------------------
# Strip currency/percent symbols from CPC, keeping only digits and the
# decimal point (raw string avoids the invalid-escape-sequence warning).
dataset['CPC'] = dataset['CPC'].str.replace(r'[^\d.]', '', regex=True)
# BUG FIX: convert CPC to numeric *before* computing its median — calling
# .median() on an object (string) column is wrong (lexicographic ordering
# or TypeError, depending on the pandas version). Unparseable cells
# become NaN and are filled with the median below.
dataset['CPC'] = pd.to_numeric(dataset['CPC'], errors='coerce')
print(dataset['CPC'])
# Count NaNs per column
dataset.isnull().sum()
# Replace missing values with the column median (robust to outliers).
# Plain assignment instead of fillna(..., inplace=True) avoids the
# chained-assignment FutureWarning in recent pandas.
mediana_clicks = dataset['Clicks'].median()
dataset['Clicks'] = dataset['Clicks'].fillna(mediana_clicks)
mediana_cpc = dataset['CPC'].median()
dataset['CPC'] = dataset['CPC'].fillna(mediana_cpc)
dataset.isnull().sum()
# Drop exact duplicate rows (print the count before and after)
num_dup = dataset.duplicated().sum()
print(num_dup)
dataset = dataset.drop_duplicates()
num_dup = dataset.duplicated().sum()
print(num_dup)
print(dataset.info())
print(dataset['CPC'])
print(dataset.info())
# Summary statistics after cleaning
dataset.describe()
# Boxplot + histogram side by side for each numeric feature.
# color=None for Clicks keeps seaborn's default palette, as in the
# original per-column plots.
for column, plot_color in (('Clicks', None),
                           ('Impressions', 'coral'),
                           ('CPC', 'purple'),
                           ('Conversions', 'green')):
    fig, axes = plt.subplots(1, 2, figsize=(12, 6))
    srn.boxplot(x=dataset[column], ax=axes[0], color=plot_color)
    axes[0].set_title('Boxplot')
    srn.histplot(dataset[column], bins=5, kde=True, ax=axes[1], color=plot_color)
    axes[1].set_title('Histogram')
    plt.tight_layout()
    plt.show()
# Fence CPC values using the classic 1.5 * IQR rule.
q1 = dataset['CPC'].quantile(0.25)
q3 = dataset['CPC'].quantile(0.75)
iqr = q3 - q1
lower_fence = q1 - 1.5 * iqr
upper_fence = q3 + 1.5 * iqr
# Rows falling outside the fences are the outliers — show them first.
out_cpc = dataset[(dataset['CPC'] < lower_fence) | (dataset['CPC'] > upper_fence)]
print(out_cpc)
# Keep only the rows whose CPC lies within [lower_fence, upper_fence]
# (between() is inclusive on both ends, matching >= and <=).
dataset = dataset[dataset['CPC'].between(lower_fence, upper_fence)]
dataset.shape
srn.boxplot(x=dataset['CPC'])
print(dataset)
# --- Multiple linear regression ----------------------------------------
# Features and target for all three models below.
X = dataset[['Clicks', 'Impressions', 'CPC']]
y = dataset['Conversions']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Standardized copies of the features.
# NOTE(review): every model in this script is fit on the *unscaled* data,
# so these scaled arrays are currently unused — confirm whether scaling
# (which matters for Ridge/Lasso penalties) was intended.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
linear_model = LinearRegression()
# BUG FIX: the original refit the model a second time *after* predicting;
# the redundant fit did nothing useful and has been removed.
linear_model.fit(X_train, y_train)
y_pred = linear_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
coefficients = pd.DataFrame(linear_model.coef_, X.columns, columns=['Coefficient'])
print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)
print("Coeficientes:\n", coefficients)
print("Intercepto:", linear_model.intercept_)
# Human-readable fitted equation: Y = b0 + b1*X1 + b2*X2 + b3*X3
formula_linear = f"Y = {linear_model.intercept_:.4f}"
for i, coef in enumerate(linear_model.coef_):
    formula_linear += f" + {coef:.4f} * X{i + 1}"
print("Multiple Linear Regression:")
print(formula_linear)
# Ridge regression: L2-penalized linear model (alpha sets the penalty strength).
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X_train, y_train)
y_pred_ridge = ridge_model.predict(X_test)
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
r2_ridge = r2_score(y_test, y_pred_ridge)
print("Ridge Model:")
print("Mean Squared Error (MSE):", mse_ridge)
print("R² Score:", r2_ridge)
print("Coefficient Ridge:\n", pd.DataFrame(ridge_model.coef_, X.columns, columns=['Coefficient']))
print("Intercepto Ridge:", ridge_model.intercept_)
# Assemble the fitted equation term by term.
ridge_terms = [f" + {coef:.4f} * X{idx + 1}" for idx, coef in enumerate(ridge_model.coef_)]
formula_ridge = f"Y = {ridge_model.intercept_:.4f}" + "".join(ridge_terms)
print("Ridge Model Formula:")
print(formula_ridge)
# Lasso regression: L1 penalty (alpha controls shrinkage; it can drive
# coefficients exactly to zero, performing feature selection).
lasso_model = Lasso(alpha=0.01)
lasso_model.fit(X_train, y_train)
y_pred_lasso = lasso_model.predict(X_test)
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
r2_lasso = r2_score(y_test, y_pred_lasso)
print("Lasso Model:")
print("Mean Squared Error (MSE):", mse_lasso)
print("R² Score:", r2_lasso)
print("Coeficientes Lasso:\n", pd.DataFrame(lasso_model.coef_, X.columns, columns=['Coefficient']))
print("Intercepto Lasso:", lasso_model.intercept_)
# Build the equation, skipping features whose coefficient was shrunk to zero.
formula_lasso = f"Y = {lasso_model.intercept_:.4f}"
for idx, coef in enumerate(lasso_model.coef_):
    if coef != 0:
        formula_lasso += f" + {coef:.4f} * X{idx + 1}"
print("Lasso Model Formula:")
print(formula_lasso)
# Recap of all three fitted formulas side by side.
print("Formulas:")
print("Multiple Regression:", formula_linear)
print("Ridge Regression:", formula_ridge)
print("Lasso Regression:", formula_lasso)
# --- Model comparison bar charts ---------------------------------------
models = ['Linear', 'Ridge', 'Lasso']
mse_values = [mse, mse_ridge, mse_lasso]
r2_values = [r2, r2_ridge, r2_lasso]
x = range(len(models))
plt.figure(figsize=(10, 5))
# Left panel: R² score per model
plt.subplot(1, 2, 1)
bars = plt.bar(x, r2_values, color=['blue', 'purple', 'green'])
for i, bar in enumerate(bars):
    # Annotate each bar with its R² value, centered just above the bar top
    plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.01,
             f'{r2_values[i]:.4f}',
             ha='center', va='bottom')
plt.xticks(x, models)
plt.ylabel('R²')
plt.title('Model Performance Comparison (R²)')
plt.ylim(0, max(r2_values) + 0.06)  # headroom on the y-axis for the labels
# Right panel: MSE per model.
# BUG FIX: the figure was laid out as a 1x2 grid and mse_values was
# computed, but the MSE panel was never drawn — added here.
plt.subplot(1, 2, 2)
mse_bars = plt.bar(x, mse_values, color=['blue', 'purple', 'green'])
for i, bar in enumerate(mse_bars):
    plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height(),
             f'{mse_values[i]:.4f}',
             ha='center', va='bottom')
plt.xticks(x, models)
plt.ylabel('MSE')
plt.title('Model Performance Comparison (MSE)')
plt.tight_layout()
plt.show()
# Actual-vs-predicted scatter for each model, with a y = x reference line
# (points on the dashed line are perfectly predicted).
plt.figure(figsize=(18, 6))
panels = [
    (y_pred, 'blue', 'Linear Regression\nActual vs. Predicted'),
    (y_pred_ridge, 'green', 'Ridge Regression\nActual vs. Predicted'),
    (y_pred_lasso, 'orange', 'Lasso Regression\nActual vs. Predicted'),
]
lo, hi = min(y_test), max(y_test)
for position, (predictions, dot_color, panel_title) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    plt.scatter(y_test, predictions, color=dot_color, alpha=0.5)
    plt.plot([lo, hi], [lo, hi], color='red', linestyle='--')
    plt.title(panel_title)
    plt.xlabel('Actual Conversions')
    plt.ylabel('Predicted Conversions')
    plt.grid()
plt.tight_layout()
plt.show()
# Score one hand-picked example through all three fitted models.
data_input = pd.DataFrame({
    'Clicks': [50],
    'Impressions': [1000],
    'CPC': [1],
})
predictions_by_label = {
    "Multiple Regression": linear_model.predict(data_input),
    "Ridge": ridge_model.predict(data_input),
    "Lasso": lasso_model.predict(data_input),
}
for label, pred in predictions_by_label.items():
    print(f"Predicted Conversions - {label}: {pred[0]:.2f}")
# Keep the original variable names around for any downstream use.
linear_prediction = predictions_by_label["Multiple Regression"]
ridge_prediction = predictions_by_label["Ridge"]
lasso_prediction = predictions_by_label["Lasso"]
# Prompt the user for one value per feature and score the row with every model.
user_input = [float(input(f"Digite o valor para {column}: ")) for column in X.columns]
input_df = pd.DataFrame([user_input], columns=X.columns)
linear_prediction = linear_model.predict(input_df)[0]
ridge_prediction = ridge_model.predict(input_df)[0]
lasso_prediction = lasso_model.predict(input_df)[0]
print("\nPredictions Results:")
print(f"Multiple Linear Regression Prediction: {linear_prediction:.2f}")
print(f"Ridge Regression Prediction: {ridge_prediction:.2f}")
print(f"Lasso Regression Prediction: {lasso_prediction:.2f}")
# Build a Graphviz flowchart of the prediction pipeline for the entered input.
clicks = input_df['Clicks'].values[0]
impressions = input_df['Impressions'].values[0]
cpc = input_df['CPC'].values[0]
dot = Digraph()
# Nodes, declared in stage order with a stage-specific fill color.
node_specs = [
    ('A', f'Data Input\n(Clicks: {clicks}, Impressions: {impressions}, CPC: {cpc})', 'lightblue'),
    ('B', 'Data Preprocessing\n(Train-Test Split, Scaling)', 'lightgreen'),
    ('C1', 'Model Used\n(Linear Regression)', 'pink'),
    ('C2', 'Model Used\n(Ridge Regression)', 'pink'),
    ('C3', 'Model Used\n(Lasso Regression)', 'pink'),
    ('D1', f'Final Prediction\n(Linear: {linear_prediction:.2f})', 'lightyellow'),
    ('D2', f'Final Prediction\n(Ridge: {ridge_prediction:.2f})', 'lightyellow'),
    ('D3', f'Final Prediction\n(Lasso: {lasso_prediction:.2f})', 'lightyellow'),
]
for node_id, label, fill in node_specs:
    dot.node(node_id, label, shape='ellipse', style='filled', fillcolor=fill)
# Edges: input -> preprocessing -> each model -> its own prediction.
for src, dst in [('A', 'B'), ('B', 'C1'), ('B', 'C2'), ('B', 'C3'),
                 ('C1', 'D1'), ('C2', 'D2'), ('C3', 'D3')]:
    dot.edge(src, dst)
# Render the diagram inline (IPython display, as in a notebook).
display(dot)