from google.colab import drive
drive.mount('/content/drive')
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
%cd '/content/drive/MyDrive/viewership_forecasting/'
/content/drive/MyDrive/viewership_forecasting
!pip install -q statsmodels
!pip install -q openpyxl
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import numpy as np
from itertools import product
import seaborn as sns
import os
from mpl_toolkits.mplot3d import Axes3D
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from mpl_toolkits.mplot3d import Axes3D
from statsmodels.tools.sm_exceptions import ValueWarning
from statsmodels.tools.sm_exceptions import ConvergenceWarning
warnings.filterwarnings('ignore', category=UserWarning, module='openpyxl')
warnings.filterwarnings('ignore', 'No frequency information was provided')
warnings.filterwarnings('ignore', category=ValueWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
def get_data():
data = pd.read_excel('NetView_Viewership_Data.xlsx', sheet_name="viewership_data_Nov_Dec_2024",
engine='openpyxl', usecols="A:G", nrows=1585)
return data
def viewers_data_cleaned(data):
df = data[data['viewers'].notna()].copy()
df_to_pred = data[data['viewers'].isna()].copy()
df['hour'] = df['hour'].astype(int)
df['day_of_week'] = df['day_of_week'].astype(int)
df['viewers'] = df['viewers'].astype(int)
df['show_finale'] = df['show_finale'].astype(int)
df['holiday'] = df['holiday'].astype(int)
return df, df_to_pred
data = get_data()
data_with_viewers, data_without_viewers = viewers_data_cleaned(data)
viewers_data = data_with_viewers.copy()
viewers_data
date | hour | day_of_week | viewers | show_finale | holiday | finale_name | |
---|---|---|---|---|---|---|---|
0 | 2024-11-01 | 0 | 5 | 177140 | 0 | 0 | NaN |
1 | 2024-11-01 | 1 | 5 | 148073 | 0 | 0 | NaN |
2 | 2024-11-01 | 2 | 5 | 115361 | 0 | 0 | NaN |
3 | 2024-11-01 | 3 | 5 | 102277 | 0 | 0 | NaN |
4 | 2024-11-01 | 4 | 5 | 103369 | 0 | 0 | NaN |
... | ... | ... | ... | ... | ... | ... | ... |
1459 | 2024-12-31 | 19 | 2 | 588440 | 0 | 1 | NaN |
1460 | 2024-12-31 | 20 | 2 | 764253 | 0 | 1 | NaN |
1461 | 2024-12-31 | 21 | 2 | 734687 | 0 | 1 | NaN |
1462 | 2024-12-31 | 22 | 2 | 676607 | 0 | 1 | NaN |
1463 | 2024-12-31 | 23 | 2 | 420217 | 0 | 1 | NaN |
1464 rows × 7 columns
def train_test_split(data, split_size):
train_size = int(len(data) * split_size)
train_data = data['viewers'].values[:train_size]
test_data = data['viewers'].values[train_size:]
return train_data, test_data
def simple_exponential_smoothing(data, alpha):
forecast = [data[0]]
for n in range(1, len(data)):
forecast.append(alpha * data[n-1] + (1 - alpha) * forecast[n-1])
return forecast
def plot_actual_vs_forecast(actual, forecast, error, finales, title):
plt.figure(figsize=(18, 6))
ax = plt.gca()
ax.plot(actual, label='Actual Viewers', color='green', alpha=0.5)
ax.plot(forecast, label='Forecast', color='orange', linestyle='--')
for i in range(len(finales)):
if finales[i] == 1:
ax.axvline(x=i, color='blue', linestyle='--', alpha=0.3)
plt.ylabel('Number of Viewers')
plt.title(f'Viewers: Actual vs Forecast ({title})')
plt.grid(True, linestyle='--', alpha=0.7)
handles, labels = ax.get_legend_handles_labels()
handles.append(plt.Line2D([0], [0], color='blue', linestyle='-', alpha=0.7))
labels.append('Finale')
plt.legend(handles, labels)
max_error = max(abs(error))
mean_error = np.mean(abs(error))
plt.text(0.8, 0.85, f'Max Abs Error: {max_error:.2f}\nMean Abs Error: {mean_error:.2f}', transform=ax.transAxes, bbox=dict(facecolor='white', alpha=0.8))
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig(f'plots/{title}.png', dpi=300, bbox_inches='tight')
plt.show()
def calculate_MAE(actual, forecast):
return np.mean([abs(a - f) for a, f in zip(actual, forecast)])
def test_alphas(alphas, data):
alpha_wise_MAEs = dict()
min_error, best_alpha = None, -1
for alpha in alphas:
forecast = simple_exponential_smoothing(data, alpha)
MAE = calculate_MAE(data, forecast)
alpha_wise_MAEs[alpha] = MAE
if min_error == None or MAE < min_error:
min_error = MAE
best_alpha = alpha
return alpha_wise_MAEs, best_alpha
def plot_alpha_wise_MAE(alpha_wise_MAEs):
alphas = list(alpha_wise_MAEs.keys())
MAEs = list(alpha_wise_MAEs.values())
plt.figure(figsize=(8, 4))
plt.plot(alphas, MAEs, 'b-o')
plt.xlabel('Alpha Value')
plt.ylabel('Mean Absolute Error (MAE)')
plt.title('Alpha vs MAE Relationship')
plt.grid(True, linestyle='--', alpha=0.7)
min_mae = min(MAEs)
best_alpha = alphas[MAEs.index(min_mae)]
plt.plot(best_alpha, min_mae, 'ro')
plt.text(best_alpha, min_mae, f'\nBest α = {best_alpha}\nMAE = {min_mae:,.0f}',
verticalalignment='bottom')
plt.tight_layout()
plt.show()
alpha_vals = np.arange(0.1, 1.1, 0.1)
print(f"Alpha Values: {alpha_vals}")
# Split data into training (75%) and testing (25%)
train_data, test_data = train_test_split(viewers_data, split_size=0.75)
alpha_wise_MAEs, best_alpha = test_alphas(alpha_vals, train_data)
plot_alpha_wise_MAE(alpha_wise_MAEs)
Alpha Values: [0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1. ]
viewers_data["sim_exp_smo_forecast"] = simple_exponential_smoothing(viewers_data['viewers'].values, best_alpha)
viewers_data["ses_error"] = viewers_data['viewers'] - viewers_data["sim_exp_smo_forecast"]
simple_exponential_smoothing_MAE = np.mean(abs(viewers_data['ses_error']))
print(f"Mean Absolute Error for Simple Exponential Smoothing: {simple_exponential_smoothing_MAE}")
plot_actual_vs_forecast(viewers_data['viewers'], viewers_data["sim_exp_smo_forecast"], viewers_data["ses_error"], viewers_data['show_finale'], 'Simple Exponential Smoothing')
viewers_data.head()
Mean Absolute Error for Simple Exponential Smoothing: 55530.79030054645
date | hour | day_of_week | viewers | show_finale | holiday | finale_name | sim_exp_smo_forecast | ses_error | |
---|---|---|---|---|---|---|---|---|---|
0 | 2024-11-01 | 0 | 5 | 177140 | 0 | 0 | NaN | 177140.0 | 0.0 |
1 | 2024-11-01 | 1 | 5 | 148073 | 0 | 0 | NaN | 177140.0 | -29067.0 |
2 | 2024-11-01 | 2 | 5 | 115361 | 0 | 0 | NaN | 148073.0 | -32712.0 |
3 | 2024-11-01 | 3 | 5 | 102277 | 0 | 0 | NaN | 115361.0 | -13084.0 |
4 | 2024-11-01 | 4 | 5 | 103369 | 0 | 0 | NaN | 102277.0 | 1092.0 |
def double_exponential_smoothing(actual, params):
alpha, beta = params
L = [actual.iloc[0]]
T = [actual.iloc[1] - actual.iloc[0]]
L_T = [L[0] + T[0]]
forecast = [actual.iloc[0]]
for t in range(1, len(actual)):
L_t = (alpha * actual.iloc[t-1]) + ((1 - alpha) * (L[t-1] + T[t-1]))
T_t = (beta * (L_t - L[t-1])) + ((1 - beta) * (T[t-1]))
F_t = (L_t + T_t)
L.append(L_t)
T.append(T_t)
forecast.append(F_t)
return forecast
def plot_parameter_combinations(results_df):
fig = plt.figure(figsize=(15, 5))
alpha_values = sorted(results_df['alpha'].unique())
beta_values = sorted(results_df['beta'].unique())
X, Y = np.meshgrid(alpha_values, beta_values)
Z = results_df.pivot(index='beta', columns='alpha', values='MAE').values
ax1 = fig.add_subplot(131, projection='3d')
surf = ax1.plot_surface(X, Y, Z, cmap='viridis')
ax1.set_xlabel('Alpha')
ax1.set_ylabel('Beta')
ax1.set_zlabel('MAE')
ax1.set_title('3D Surface Plot')
fig.colorbar(surf)
ax2 = fig.add_subplot(132)
contour = ax2.contour(X, Y, Z, levels=15)
ax2.clabel(contour, inline=True, fontsize=8)
ax2.set_xlabel('Alpha')
ax2.set_ylabel('Beta')
ax2.set_title('Contour Plot')
ax3 = fig.add_subplot(133)
heatmap = ax3.pcolor(X, Y, Z, cmap='viridis')
ax3.set_xlabel('Alpha')
ax3.set_ylabel('Beta')
ax3.set_title('Heatmap')
fig.colorbar(heatmap)
best_alpha = results_df.loc[results_df['MAE'].idxmin(), 'alpha']
best_beta = results_df.loc[results_df['MAE'].idxmin(), 'beta']
ax2.plot(best_alpha, best_beta, 'r*', markersize=15, label=f'Best (α={best_alpha:.2f}, β={best_beta:.2f})')
ax2.legend()
ax3.plot(best_alpha, best_beta, 'r*', markersize=15)
plt.tight_layout()
plt.show()
alphas = np.arange(0.1, 1.1, 0.1)
betas = np.arange(0.1, 1.1, 0.1)
results = []
for alpha, beta in product(alphas, betas):
params = (alpha, beta)
forecast = double_exponential_smoothing(viewers_data['viewers'], params)
mae = np.mean(abs(viewers_data['viewers'] - forecast))
results.append({'alpha': alpha, 'beta': beta, 'MAE': mae})
results_data = pd.DataFrame(results)
best_params = results_data.loc[results_data['MAE'].idxmin()]
print(f"Best parameters: Alpha: {best_params['alpha']:.2f} Beta: {best_params['beta']:.2f}")
print(f"MAE: {best_params['MAE']:.2f}")
plot_parameter_combinations(results_data)
params = (best_params['alpha'], best_params['beta'])
viewers_data["dou_exp_smo_forecast"] = double_exponential_smoothing(viewers_data['viewers'], params)
viewers_data["des_error"] = viewers_data['viewers'] - viewers_data["dou_exp_smo_forecast"]
double_exponential_smoothing_MAE = np.mean(abs(viewers_data['des_error']))
viewers_data.head()
Best parameters: Alpha: 1.00 Beta: 0.10 MAE: 56471.67
date | hour | day_of_week | viewers | show_finale | holiday | finale_name | sim_exp_smo_forecast | ses_error | dou_exp_smo_forecast | des_error | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2024-11-01 | 0 | 5 | 177140 | 0 | 0 | NaN | 177140.0 | 0.0 | 177140.0000 | 0.0000 |
1 | 2024-11-01 | 1 | 5 | 148073 | 0 | 0 | NaN | 177140.0 | -29067.0 | 150979.7000 | -2906.7000 |
2 | 2024-11-01 | 2 | 5 | 115361 | 0 | 0 | NaN | 148073.0 | -32712.0 | 121622.0300 | -6261.0300 |
3 | 2024-11-01 | 3 | 5 | 102277 | 0 | 0 | NaN | 115361.0 | -13084.0 | 88283.9270 | 13993.0730 |
4 | 2024-11-01 | 4 | 5 | 103369 | 0 | 0 | NaN | 102277.0 | 1092.0 | 76599.2343 | 26769.7657 |
print(f"Mean Absolute Error for Double Exponential Smoothing: {double_exponential_smoothing_MAE}")
plot_actual_vs_forecast(viewers_data['viewers'], viewers_data["dou_exp_smo_forecast"], viewers_data["des_error"], viewers_data['show_finale'], 'Double Exponential Smoothing')
Mean Absolute Error for Double Exponential Smoothing: 56471.6668618948
def triple_exponential_smoothing(actual, params):
alpha, beta, gamma, seasonal_period = params
L = [actual.iloc[0]]
T = [(actual.iloc[1] - actual.iloc[0])]
S = [actual.iloc[i] / (sum(actual.iloc[:seasonal_period]) / seasonal_period) for i in range(seasonal_period)]
forecast = [actual.iloc[0]]
for t in range(1, len(actual)):
if t >= seasonal_period:
# Once we have data for one complete season (a complete week)
L_t = alpha * (actual.iloc[t-1] / S[t - seasonal_period]) + (1 - alpha) * (L[t-1] + T[t-1])
T_t = beta * (L_t - L[t-1]) + (1 - beta) * T[t-1]
S_t = gamma * (actual.iloc[t] / L_t) + (1 - gamma) * S[t - seasonal_period]
else:
# Initially when the model has not seen the seasonality effect
L_t = alpha * actual.iloc[t-1] + (1 - alpha) * (L[t-1] + T[t-1])
T_t = beta * (L_t - L[t-1]) + (1 - beta) * T[t-1]
S_t = 1
L.append(L_t)
T.append(T_t)
S.append(S_t)
F_t = (L_t + T_t) * S_t
forecast.append(F_t)
return L, T, S, forecast
def plot_parameter_combinations(results_data):
fig = plt.figure(figsize=(20, 5))
for i, param in enumerate(['alpha', 'beta', 'gamma']):
ax = fig.add_subplot(141 + i)
param_mae = results_data.groupby(param)['MAE'].mean()
ax.plot(param_mae.index, param_mae.values, ['b-o','g-o','r-o'][i])
ax.set(xlabel=param.capitalize(), ylabel='Average MAE', title=f'{param.capitalize()} vs Average MAE')
ax.plot(param_mae.idxmin(), param_mae.min(), 'r*', markersize=15)
scatter = fig.add_subplot(144, projection='3d').scatter(results_data['alpha'], results_data['beta'], results_data['gamma'], c=results_data['MAE'], cmap='viridis')
fig.axes[-1].set(xlabel='Alpha', ylabel='Beta', zlabel='Gamma', title='Parameter Space\n(color = MAE)')
fig.colorbar(scatter)
plt.tight_layout()
plt.show()
alphas = np.arange(0.1, 1.1, 0.1)
betas = np.arange(0.1, 1.1, 0.1)
gammas = np.arange(0.1, 1.1, 0.1)
seasonal_period = 168
results = []
for alpha, beta, gamma in product(alphas, betas, gammas):
params = (alpha, beta, gamma, seasonal_period)
L, T, S, forecast = triple_exponential_smoothing(viewers_data['viewers'], params)
mae = np.mean(abs(viewers_data['viewers'] - forecast))
results.append({'alpha': alpha, 'beta': beta, 'gamma': gamma, 'MAE': mae})
results_data = pd.DataFrame(results)
top_5_rows = results_data.nsmallest(5, 'MAE')
print("\n5 parameter combinations with lowest test MAE:")
print(top_5_rows.to_string(index=False))
best_params = results_data.loc[results_data['MAE'].idxmin()]
print(f"Best parameters: Alpha: {best_params['alpha']:.2f} Beta: {best_params['beta']:.2f} Gamma: {best_params['gamma']:.2f}")
print(f"MAE: {best_params['MAE']:.2f}")
5 parameter combinations with lowest test MAE: alpha beta gamma MAE 0.1 0.1 1.0 23052.628386 0.1 0.1 0.9 27412.707941 1.0 0.1 0.9 30522.762082 1.0 0.1 1.0 30540.394012 1.0 0.1 0.8 31878.913159 Best parameters: Alpha: 0.10 Beta: 0.10 Gamma: 1.00 MAE: 23052.63
plot_parameter_combinations(results_data)
split_point = int(len(viewers_data) * 0.75)
params = (best_params['alpha'], best_params['beta'], best_params['gamma'], seasonal_period)
L, T, S, viewers_data["tri_exp_smo_forecast"] = triple_exponential_smoothing(viewers_data['viewers'], params)
viewers_data["tes_error"] = viewers_data['viewers'] - viewers_data["tri_exp_smo_forecast"]
train_mae = np.mean(abs(viewers_data['tes_error'][:split_point]))
test_mae = np.mean(abs(viewers_data['tes_error'][split_point:]))
print(f"Training MAE (first 75%): {train_mae:.2f}")
print(f"Test MAE (last 25%): {test_mae:.2f}")
triple_exponential_smoothing_MAE = np.mean(abs(viewers_data['tes_error']))
print(f"Overall MAE: {triple_exponential_smoothing_MAE:.2f}")
viewers_data.head()
Training MAE (first 75%): 27534.28 Test MAE (last 25%): 9607.67 Overall MAE: 23052.63
date | hour | day_of_week | viewers | show_finale | holiday | finale_name | sim_exp_smo_forecast | ses_error | dou_exp_smo_forecast | des_error | tri_exp_smo_forecast | tes_error | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2024-11-01 | 0 | 5 | 177140 | 0 | 0 | NaN | 177140.0 | 0.0 | 177140.0000 | 0.0000 | 177140.000000 | 0.000000 |
1 | 2024-11-01 | 1 | 5 | 148073 | 0 | 0 | NaN | 177140.0 | -29067.0 | 150979.7000 | -2906.7000 | 122203.370000 | 25869.630000 |
2 | 2024-11-01 | 2 | 5 | 115361 | 0 | 0 | NaN | 148073.0 | -32712.0 | 121622.0300 | -6261.0300 | 96272.699300 | 19088.300700 |
3 | 2024-11-01 | 3 | 5 | 102277 | 0 | 0 | NaN | 115361.0 | -13084.0 | 88283.9270 | 13993.0730 | 69854.778677 | 32422.221323 |
4 | 2024-11-01 | 4 | 5 | 103369 | 0 | 0 | NaN | 102277.0 | 1092.0 | 76599.2343 | 26769.7657 | 45094.472330 | 58274.527670 |
# Prediction utilizing 9pm (actual value)
future_forecast = None
L_t = alpha * viewers_data['viewers'].iloc[-4] + (1 - alpha) * (L[-1] + T[-1])
T_t = beta * (L_t - L[-1]) + (1 - beta) * T[-1]
S_t = 1
future_forecast = (L_t + T_t) * S_t
print(f"Viewership Forecast for Show Release: {future_forecast}")
Viewership Forecast for Show Release: 1166685.8254944792
plot_actual_vs_forecast(viewers_data['viewers'], viewers_data["tri_exp_smo_forecast"], viewers_data["tes_error"], viewers_data['show_finale'], 'Triple Exponential Smoothing')
result = viewers_data.groupby('day_of_week')['viewers'].sum().reset_index()
day_counts = viewers_data['day_of_week'].value_counts().reset_index()
day_counts.columns = ['day_of_week', 'Number of Days']
res = result.merge(day_counts, on='day_of_week').sort_values('day_of_week')
# 24 rows for 24 hours. Thus divide by 24
res['Number of Days'] = res['Number of Days'] / 24
res['viewers'] = (res['viewers'].astype(float) / res['Number of Days']).apply(lambda x: f'{x:.2f}')
print(res)
day_of_week viewers Number of Days 0 1 6617471.00 9.0 1 2 6882916.89 9.0 2 3 7027930.62 8.0 3 4 6743724.12 8.0 4 5 7435946.11 9.0 5 6 8940560.56 9.0 6 7 8594574.00 9.0
res['viewers'] = res['viewers'].astype(float)
plt.style.use('default')
plt.figure(figsize=(10, 6))
day_mapping = {1: 'Mon', 2: 'Tue', 3: 'Wed', 4: 'Thu', 5: 'Fri', 6: 'Sat', 7: 'Sun'}
res['day_name'] = res['day_of_week'].map(day_mapping)
ax = sns.barplot(data=res, x='day_name', y='viewers')
plt.title('Average Daily Viewers by Day of Week', pad=15)
plt.xlabel('Day of Week')
plt.ylabel('Average Number of Viewers')
plt.xticks(rotation=45)
plt.gca().yaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: format(int(x), ',')))
for i, v in enumerate(res['viewers']):
ax.text(i, v, format(int(v), ','), ha='center', va='bottom')
plt.tight_layout()
plt.savefig('plots/viewers_by_day.png', dpi=300, bbox_inches='tight')
plt.show()
res = viewers_data.groupby('hour')['viewers'].mean().reset_index()
print(res)
hour viewers 0 0 206832.311475 1 1 156215.262295 2 2 131234.459016 3 3 106288.868852 4 4 106719.098361 5 5 131505.065574 6 6 203406.786885 7 7 255739.114754 8 8 305269.229508 9 9 255328.655738 10 10 230667.475410 11 11 255150.196721 12 12 309540.721311 13 13 358252.606557 14 14 330315.639344 15 15 308026.475410 16 16 361207.852459 17 17 407329.065574 18 18 471122.163934 19 19 524078.295082 20 20 576007.540984 21 21 608685.901639 22 22 520414.344262 23 23 362899.836066
res['viewers'] = res['viewers'].astype(float)
res['hour'] = res['hour'].astype(int)
hour_labels = ['12-1 AM', '1-2 AM', '2-3 AM', '3-4 AM', '4-5 AM', '5-6 AM', '6-7 AM', '7-8 AM', '8-9 AM', '9-10 AM', '10-11 AM', '11-12 PM', '12-1 PM', '1-2 PM', '2-3 PM', '3-4 PM', '4-5 PM', '5-6 PM', '6-7 PM', '7-8 PM', '8-9 PM', '9-10 PM', '10-11 PM', '11-12 AM']
plt.style.use('default')
plt.figure(figsize=(15, 6))
plt.bar(hour_labels, res['viewers'], color='skyblue', edgecolor='navy')
for i, v in enumerate(res['viewers']):
plt.text(i, v * 1.01, f'{v:,.0f}', ha='center', va='bottom')
plt.title('Viewers by Hour', fontsize=14, pad=15)
plt.xlabel('Time of Day', fontsize=12)
plt.ylabel('Number of Viewers', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.grid(True, linestyle='--', alpha=0.3, axis='y')
plt.tight_layout()
plt.savefig('plots/viewers_by_hour.png', dpi=300, bbox_inches='tight')
plt.show()
holiday_counts = viewers_data['holiday'].value_counts()
print(f"Number of non-holiday days (0): {holiday_counts[0]}")
print(f"Number of holiday days (1): {holiday_counts[1]}\n")
out = viewers_data.groupby('holiday')['viewers'].mean().reset_index()
print(out)
Number of non-holiday days (0): 1368 Number of holiday days (1): 96 holiday viewers 0 0 304174.430556 1 1 419852.437500
plt.style.use('default')
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
counts_data = viewers_data['holiday'].value_counts().reset_index()
counts_data.columns = ['holiday', 'count']
counts_data = counts_data.sort_values('holiday')
colors1 = ['#FF9999', '#66B2FF']
ax1.bar(['Non-Holiday', 'Holiday'], counts_data['count'], color=colors1, edgecolor='navy')
for i, v in enumerate(counts_data['count']):
ax1.text(i, v + v*0.01, f'{v:,.0f}', ha='center', va='bottom')
ax1.set_title('Number of Holidays', fontsize=14, pad=15)
ax1.set_ylabel('Count of Days', fontsize=12)
ax1.grid(True, linestyle='--', alpha=0.3, axis='y')
mean_data = viewers_data.groupby('holiday')['viewers'].mean().reset_index()
mean_data = mean_data.sort_values('holiday')
colors2 = ['#99FF99', '#FFCC99']
ax2.bar(['Non-Holiday', 'Holiday'], mean_data['viewers'], color=colors2, edgecolor='navy')
for i, v in enumerate(mean_data['viewers']):
ax2.text(i, v + v*0.01, f'{v:,.0f}', ha='center', va='bottom')
ax2.set_title('Mean Viewership by Holidays', fontsize=14, pad=15)
ax2.set_ylabel('Average Number of Viewers', fontsize=12)
ax2.grid(True, linestyle='--', alpha=0.3, axis='y')
plt.tight_layout()
plt.savefig('plots/holidays.png', dpi=300, bbox_inches='tight')
plt.show()
finale_counts = viewers_data['show_finale'].value_counts()
print(f"Number of non-finale days (0): {finale_counts[0]}")
print(f"Number of finale days (1): {finale_counts[1]}\n")
out2 = viewers_data.groupby('show_finale')['viewers'].mean().reset_index()
print(out2)
Number of non-finale days (0): 1461 Number of finale days (1): 3 show_finale viewers 0 0 3.097847e+05 1 1 1.273654e+06
plt.style.use('default')
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
counts_data = viewers_data['show_finale'].value_counts().reset_index()
counts_data.columns = ['show_finale', 'count']
counts_data = counts_data.sort_values('show_finale')
colors1 = ['#FF9999', '#66B2FF']
ax1.bar(['Non-Show-Finale', 'Show-Finale'], counts_data['count'], color=colors1, edgecolor='navy')
for i, v in enumerate(counts_data['count']):
ax1.text(i, v + v*0.01, f'{v:,.0f}', ha='center', va='bottom')
ax1.set_title('Number of Days by Show-Finale', fontsize=14, pad=15)
ax1.set_ylabel('Count of Days', fontsize=12)
ax1.grid(True, linestyle='--', alpha=0.3, axis='y')
mean_data = viewers_data.groupby('show_finale')['viewers'].mean().reset_index()
mean_data = mean_data.sort_values('show_finale')
colors2 = ['#99FF99', '#FFCC99']
ax2.bar(['Non-Show-Finale', 'Show-Finale'], mean_data['viewers'], color=colors2, edgecolor='navy')
for i, v in enumerate(mean_data['viewers']):
ax2.text(i, v + v*0.01, f'{v:,.0f}', ha='center', va='bottom')
ax2.set_title('Mean Viewership by Show-Finale', fontsize=14, pad=15)
ax2.set_ylabel('Average Number of Viewers', fontsize=12)
ax2.grid(True, linestyle='--', alpha=0.3, axis='y')
plt.tight_layout()
plt.savefig('plots/show_finale.png', dpi=300, bbox_inches='tight')
plt.show()
def train_sarimax_model(df, exog_vars, train_split_size):
train_size = int(len(df) * train_split_size)
train_data = df.iloc[:train_size]
test_data = df.iloc[train_size:]
endog_train = df['viewers']
exog_train = df[exog_vars] if exog_vars else None
model = SARIMAX(
endog_train,
exog=exog_train,
order=(1, 1, 1),
seasonal_order=(1, 1, 1, 24),
enforce_stationarity=False,
enforce_invertibility=False
)
fitted_model = model.fit(disp=False)
return fitted_model, train_data, test_data
def get_model_predictions(model, train_data, test_data, exog_vars):
train_exog = train_data[exog_vars]
test_exog = test_data[exog_vars]
train_predicted = model.get_prediction(start=0, end=len(train_data)-1, exog=train_exog).predicted_mean
test_predicted = model.get_forecast(steps=len(test_data), exog=test_exog).predicted_mean
forecast = pd.concat([pd.Series(train_predicted), pd.Series(test_predicted)])
full_data = pd.concat([train_data, test_data])
train_mae = mean_absolute_error(train_data['viewers'], train_predicted)
test_mae = mean_absolute_error(test_data['viewers'], test_predicted)
overall_mae = mean_absolute_error(full_data['viewers'], forecast)
return {'train_mae':train_mae,'test_mae':test_mae,'mae':overall_mae,'actual':full_data['viewers'],'forecast':forecast,
'train_predicted':train_predicted,'test_predicted':test_predicted,'train':train_data['viewers'],
'test':test_data['viewers']}
def plot_predictions(results):
plt.figure(figsize=(15, 7))
train_index = results['train'].index
test_index = results['test'].index
plt.plot(train_index, results['train'], label='Train Actual')
plt.plot(train_index, results['train_predicted'], label='Train Predicted', linestyle='--', alpha=0.8)
plt.plot(test_index, results['test'], label='Test Actual')
plt.plot(test_index, results['test_predicted'], label='Test Predicted', linestyle='--', alpha=0.8)
plt.title(f'SARIMAX Model: Actual vs Predicted (MAE: {results["mae"]:.2f})')
plt.xlabel('Date')
plt.ylabel('Number of Viewers')
plt.legend()
plt.show()
def prep_for_sarimax(df, exog_vars):
if 'day_of_week' in df.columns and not any(col.startswith('dow_') for col in df.columns):
dow_dummies = pd.get_dummies(df['day_of_week'], prefix='dow').astype(int)
all_days = [f'dow_{i}' for i in range(1, 8)]
for day in all_days:
if day not in dow_dummies.columns:
dow_dummies[day] = 0
dow_dummies = dow_dummies[all_days]
exog_vars.extend(all_days)
df = pd.concat([df, dow_dummies], axis=1)
return df, exog_vars
df, exog_vars = prep_for_sarimax(viewers_data, ['holiday', 'show_finale'])
print(f"Exogenous Variables: {exog_vars}")
df
Exogenous Variables: ['holiday', 'show_finale', 'dow_1', 'dow_2', 'dow_3', 'dow_4', 'dow_5', 'dow_6', 'dow_7']
date | hour | day_of_week | viewers | show_finale | holiday | finale_name | sim_exp_smo_forecast | ses_error | dou_exp_smo_forecast | des_error | tri_exp_smo_forecast | tes_error | dow_1 | dow_2 | dow_3 | dow_4 | dow_5 | dow_6 | dow_7 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2024-11-01 | 0 | 5 | 177140 | 0 | 0 | NaN | 177140.0 | 0.0 | 177140.000000 | 0.000000 | 177140.000000 | 0.000000 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
1 | 2024-11-01 | 1 | 5 | 148073 | 0 | 0 | NaN | 177140.0 | -29067.0 | 150979.700000 | -2906.700000 | 122203.370000 | 25869.630000 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
2 | 2024-11-01 | 2 | 5 | 115361 | 0 | 0 | NaN | 148073.0 | -32712.0 | 121622.030000 | -6261.030000 | 96272.699300 | 19088.300700 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
3 | 2024-11-01 | 3 | 5 | 102277 | 0 | 0 | NaN | 115361.0 | -13084.0 | 88283.927000 | 13993.073000 | 69854.778677 | 32422.221323 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
4 | 2024-11-01 | 4 | 5 | 103369 | 0 | 0 | NaN | 102277.0 | 1092.0 | 76599.234300 | 26769.765700 | 45094.472330 | 58274.527670 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1459 | 2024-12-31 | 19 | 2 | 588440 | 0 | 1 | NaN | 612805.0 | -24365.0 | 637890.025919 | -49450.025919 | 564783.145043 | 23656.854957 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
1460 | 2024-12-31 | 20 | 2 | 764253 | 0 | 1 | NaN | 588440.0 | 175813.0 | 608580.023327 | 155672.976673 | 730250.475080 | 34002.524920 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
1461 | 2024-12-31 | 21 | 2 | 734687 | 0 | 1 | NaN | 764253.0 | -29566.0 | 799960.320995 | -65273.320995 | 702204.046807 | 32482.953193 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
1462 | 2024-12-31 | 22 | 2 | 676607 | 0 | 1 | NaN | 734687.0 | -58080.0 | 763866.988895 | -87259.988895 | 658153.036321 | 18453.963679 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
1463 | 2024-12-31 | 23 | 2 | 420217 | 0 | 1 | NaN | 676607.0 | -256390.0 | 697060.990006 | -276843.990006 | 423446.594042 | -3229.594042 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
1464 rows × 20 columns
model, train_data, test_data = train_sarimax_model(df, exog_vars, train_split_size=0.75)
results = get_model_predictions(model, train_data, test_data, exog_vars)
plot_predictions(results)
print(f"Train MAE: {results['train_mae']}")
print(f"Test MAE: {results['test_mae']}")
print(f"Overall MAE: {results['mae']}")
Train MAE: 25023.015095586925 Test MAE: 215398.15686336672 Overall MAE: 72616.80053753188
def make_future_predictions(model, df_to_pred, exog_vars):
exog_test = df_to_pred[exog_vars]
forecast = model.get_forecast(steps=len(df_to_pred), exog=exog_test)
predicted_values = forecast.predicted_mean
confidence_intervals = forecast.conf_int()
predicted_df = df_to_pred.copy()
predicted_df['viewers'] = predicted_values
return predicted_df
df_to_pred, exog_vars = prep_for_sarimax(data_without_viewers, ['holiday', 'show_finale'])
print(f"Exogenous Variables: {exog_vars}")
predicted_df = make_future_predictions(model, df_to_pred, exog_vars)
predicted_df
Exogenous Variables: ['holiday', 'show_finale', 'dow_1', 'dow_2', 'dow_3', 'dow_4', 'dow_5', 'dow_6', 'dow_7']
date | hour | day_of_week | viewers | show_finale | holiday | finale_name | dow_1 | dow_2 | dow_3 | dow_4 | dow_5 | dow_6 | dow_7 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1464 | 2025-01-01 | 0 | 3 | 298791.954794 | 0 | 1 | NaN | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
1465 | 2025-01-01 | 1 | 3 | 252381.876048 | 0 | 1 | NaN | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
1466 | 2025-01-01 | 2 | 3 | 223855.235285 | 0 | 1 | NaN | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
1467 | 2025-01-01 | 3 | 3 | 199042.483767 | 0 | 1 | NaN | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
1468 | 2025-01-01 | 4 | 3 | 203293.737002 | 0 | 1 | NaN | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1579 | 2025-01-05 | 19 | 7 | 565183.280929 | 0 | 0 | NaN | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
1580 | 2025-01-05 | 20 | 7 | 644691.975336 | 0 | 0 | NaN | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
1581 | 2025-01-05 | 21 | 7 | 634358.577117 | 0 | 0 | NaN | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
1582 | 2025-01-05 | 22 | 7 | 584218.933195 | 0 | 0 | NaN | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
1583 | 2025-01-05 | 23 | 7 | 399675.507313 | 0 | 0 | NaN | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
120 rows × 14 columns
def combine_dataframes(df, df_to_pred):
dow_cols = [col for col in df.columns if col.startswith('dow_')]
df_clean = df.drop(columns=dow_cols, errors='ignore')
df_to_pred_clean = df_to_pred.drop(columns=dow_cols, errors='ignore')
final_df = pd.concat([df_clean, df_to_pred_clean], axis=0)
return final_df
final_df = combine_dataframes(df, predicted_df)
# Exporting predictions using SARIMA
final_df.to_excel('baseline_SARIMA_simple_train_test_split_preds.xlsx', index=False)
def get_train_test_partition(df, train_split_size, offset=0):
n_samples = len(df)
train_size = int(n_samples * train_split_size)
start_idx = offset % (n_samples - train_size)
return df.iloc[start_idx:start_idx+train_size], df.iloc[start_idx+train_size:min(start_idx+train_size+24, n_samples)]
def fit_model_with_method(train_data, exog_vars, order, method, options, best_params=None):
model = SARIMAX(train_data['viewers'], exog=train_data[exog_vars] if exog_vars else None,
order=order, seasonal_order=(1, 1, 1, 24))
if best_params is None:
fitted_model = model.fit(disp=False, method=method, **options)
else:
fitted_model = model.fit(start_params=best_params, disp=False, method=method, **options)
return fitted_model
def train_single_iteration(df, train_split_size, offset, exog_vars, order, methods, best_params):
train_data, test_data = get_train_test_partition(df, train_split_size, offset)
iteration_results, best_iter_mae, best_iter_params, best_iter_method = {}, float('inf'), best_params, None
for method, options in methods.items():
try:
fitted_model = fit_model_with_method(train_data, exog_vars, order, method, options, best_params)
predictions = get_model_predictions(fitted_model, train_data, test_data, exog_vars)
mae = predictions['mae']
iteration_results[f'{method}_mae'] = mae
if mae < best_iter_mae:
best_iter_mae, best_iter_params, best_iter_method = mae, fitted_model.params, method
except Exception as e:
print(f"Error with {method}: {str(e)}")
iteration_results[f'{method}_mae'] = None
return iteration_results, best_iter_mae, best_iter_params, best_iter_method
def make_future_predictions(model, future_df, exog_vars):
future_exog = future_df[exog_vars] if exog_vars else None
forecast = model.get_forecast(steps=len(future_df), exog=future_exog)
return pd.DataFrame({'date': future_df.index,'predicted_viewers': forecast.predicted_mean,'conf_int_lower':
forecast.conf_int()['lower viewers'],'conf_int_upper': forecast.conf_int()['upper viewers']})
def plot_training_history(training_history):
plt.figure(figsize=(12, 6))
for method in ['lbfgs', 'powell', 'nm', 'cg']:
plt.plot(training_history['iteration'], training_history[f'{method}_mae'], label=f'{method} MAE')
plt.title('Model Performance Over Iterations')
plt.xlabel('Iteration')
plt.ylabel('MAE')
plt.legend()
plt.show()
def iterative_train_improve_model(df, exog_vars, iterations, order, convergence_iters, train_split_size=0.75):
warnings.filterwarnings('ignore', category=ConvergenceWarning)
evaluation_results = []
best_mae = float('inf')
best_params = None
best_method = None
methods = {'lbfgs': {'maxiter': convergence_iters}, 'powell': {'maxiter': convergence_iters},
'nm': {'maxiter': convergence_iters}, 'cg': {'maxiter': convergence_iters}}
for i in range(iterations):
print(f"\nIteration {i+1}/{iterations}")
print("-" * 40)
offset = i * 24
iter_results, iter_mae, iter_params, iter_method = train_single_iteration(df, train_split_size,
offset, exog_vars, order, methods, best_params)
iter_results['iteration'] = i
evaluation_results.append(iter_results)
if iter_mae < best_mae:
best_mae, best_params, best_method = iter_mae, iter_params, iter_method
print(f"Current Best Method: {best_method}")
print(f"Current Best MAE: {best_mae:.2f}")
print(f"All Methods MAE:")
for method in methods:
mae = iter_results.get(f'{method}_mae')
if mae is not None:
print(f" {method:6}: {mae:.2f}")
else:
print(f" {method:6}: Failed")
final_model = SARIMAX(df['viewers'], exog=df[exog_vars] if exog_vars else None,
order=order, seasonal_order=(1, 1, 1, 24))
with warnings.catch_warnings(): # Add warning filter for final fit
warnings.simplefilter('ignore')
final_fitted = final_model.fit(start_params=best_params, disp=False, method=best_method)
return {'final_model': final_fitted,'training_history': pd.DataFrame(evaluation_results),'best_method': best_method}
n_iterations = 5
results = iterative_train_improve_model(df=df,exog_vars=exog_vars,iterations=n_iterations,order=(1,1,1),convergence_iters=100)
plot_training_history(results['training_history'])
final_predictions = make_future_predictions(results['final_model'], df_to_pred, exog_vars)
Iteration 1/5 ---------------------------------------- Current Best Method: powell Current Best MAE: 23066.08 All Methods MAE: lbfgs : 23198.22 powell: 23066.08 nm : 24855.28 cg : 536311237734.27 Iteration 2/5 ---------------------------------------- Current Best Method: lbfgs Current Best MAE: 22603.62 All Methods MAE: lbfgs : 22603.62 powell: 22610.85 nm : 22666.99 cg : 55977.23 Iteration 3/5 ---------------------------------------- Current Best Method: powell Current Best MAE: 21331.39 All Methods MAE: lbfgs : 22926.21 powell: 21331.39 nm : 23031.28 cg : 31925.56 Iteration 4/5 ---------------------------------------- Current Best Method: powell Current Best MAE: 21242.85 All Methods MAE: lbfgs : 21263.10 powell: 21242.85 nm : 21265.11 cg : 262438.95 Iteration 5/5 ---------------------------------------- Current Best Method: powell Current Best MAE: 21242.85 All Methods MAE: lbfgs : 22410.36 powell: 22399.42 nm : 22521.87 cg : 57743.11
final_predictions
date | predicted_viewers | conf_int_lower | conf_int_upper | |
---|---|---|---|---|
1464 | 1464 | 334293.523330 | 277984.835146 | 390602.211515 |
1465 | 1465 | 292050.242970 | 229931.081083 | 354169.404857 |
1466 | 1466 | 267921.090414 | 202187.499416 | 333654.681411 |
1467 | 1467 | 243783.591105 | 174852.276391 | 312714.905819 |
1468 | 1468 | 244395.769128 | 172442.374439 | 316349.163817 |
... | ... | ... | ... | ... |
1579 | 1579 | 555275.475453 | 314876.675291 | 795674.275615 |
1580 | 1580 | 606395.991193 | 364949.873253 | 847842.109132 |
1581 | 1581 | 601332.277736 | 358843.270006 | 843821.285465 |
1582 | 1582 | 551390.190891 | 307862.172432 | 794918.209350 |
1583 | 1583 | 394676.292746 | 150110.534120 | 639242.051373 |
120 rows × 4 columns
def combine_dataframes(df, df_to_pred):
dow_cols = [col for col in df.columns if col.startswith('dow_')]
df_clean = df.drop(columns=dow_cols, errors='ignore')
df_to_pred_clean = df_to_pred.drop(columns=dow_cols, errors='ignore')
final_df = pd.concat([df_clean, df_to_pred_clean], axis=0)
return final_df
final_df = combine_dataframes(df, final_predictions)
# Exporting predictions using SARIMA
final_df.to_excel('SARIMA_bootstrapping_preds.xlsx', index=False)