In [1]:
from google.colab import drive
drive.mount('/content/drive')
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
In [2]:
%cd '/content/drive/MyDrive/viewership_forecasting/'
/content/drive/MyDrive/viewership_forecasting
In [3]:
!pip install -q statsmodels
!pip install -q openpyxl
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import numpy as np
from itertools import product
import seaborn as sns
import os
from mpl_toolkits.mplot3d import Axes3D
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from mpl_toolkits.mplot3d import Axes3D
from statsmodels.tools.sm_exceptions import ValueWarning
from statsmodels.tools.sm_exceptions import ConvergenceWarning
warnings.filterwarnings('ignore', category=UserWarning, module='openpyxl')
warnings.filterwarnings('ignore', 'No frequency information was provided')
warnings.filterwarnings('ignore', category=ValueWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

FETCHING DATA FROM EXCEL FILE -¶

In [4]:
def get_data():
    data = pd.read_excel('NetView_Viewership_Data.xlsx', sheet_name="viewership_data_Nov_Dec_2024",
                         engine='openpyxl', usecols="A:G", nrows=1585)
    return data

def viewers_data_cleaned(data):
    df = data[data['viewers'].notna()].copy()
    df_to_pred = data[data['viewers'].isna()].copy()
    df['hour'] = df['hour'].astype(int)
    df['day_of_week'] = df['day_of_week'].astype(int)
    df['viewers'] = df['viewers'].astype(int)
    df['show_finale'] = df['show_finale'].astype(int)
    df['holiday'] = df['holiday'].astype(int)
    return df, df_to_pred

data = get_data()
data_with_viewers, data_without_viewers = viewers_data_cleaned(data)
viewers_data = data_with_viewers.copy()
viewers_data
Out[4]:
date hour day_of_week viewers show_finale holiday finale_name
0 2024-11-01 0 5 177140 0 0 NaN
1 2024-11-01 1 5 148073 0 0 NaN
2 2024-11-01 2 5 115361 0 0 NaN
3 2024-11-01 3 5 102277 0 0 NaN
4 2024-11-01 4 5 103369 0 0 NaN
... ... ... ... ... ... ... ...
1459 2024-12-31 19 2 588440 0 1 NaN
1460 2024-12-31 20 2 764253 0 1 NaN
1461 2024-12-31 21 2 734687 0 1 NaN
1462 2024-12-31 22 2 676607 0 1 NaN
1463 2024-12-31 23 2 420217 0 1 NaN

1464 rows × 7 columns

SIMPLE EXPONENTIAL SMOOTHING -¶

Train Test Split -¶

In [5]:
def train_test_split(data, split_size):
    train_size = int(len(data) * split_size)
    train_data = data['viewers'].values[:train_size]
    test_data = data['viewers'].values[train_size:]
    return train_data, test_data

Model Definition & Plotting Function -¶

In [6]:
def simple_exponential_smoothing(data, alpha):
    forecast = [data[0]]
    for n in range(1, len(data)):
        forecast.append(alpha * data[n-1] + (1 - alpha) * forecast[n-1])
    return forecast

def plot_actual_vs_forecast(actual, forecast, error, finales, title):
    plt.figure(figsize=(18, 6))
    ax = plt.gca()
    ax.plot(actual, label='Actual Viewers', color='green', alpha=0.5)
    ax.plot(forecast, label='Forecast', color='orange', linestyle='--')
    for i in range(len(finales)):
        if finales[i] == 1:
            ax.axvline(x=i, color='blue', linestyle='--', alpha=0.3)
    plt.ylabel('Number of Viewers')
    plt.title(f'Viewers: Actual vs Forecast ({title})')
    plt.grid(True, linestyle='--', alpha=0.7)
    handles, labels = ax.get_legend_handles_labels()
    handles.append(plt.Line2D([0], [0], color='blue', linestyle='-', alpha=0.7))
    labels.append('Finale')
    plt.legend(handles, labels)
    max_error = max(abs(error))
    mean_error = np.mean(abs(error))
    plt.text(0.8, 0.85, f'Max Abs Error: {max_error:.2f}\nMean Abs Error: {mean_error:.2f}', transform=ax.transAxes, bbox=dict(facecolor='white', alpha=0.8))
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(f'plots/{title}.png', dpi=300, bbox_inches='tight')
    plt.show()

def calculate_MAE(actual, forecast):
    return np.mean([abs(a - f) for a, f in zip(actual, forecast)])

Testing for Best Alpha -¶

In [7]:
def test_alphas(alphas, data):
    alpha_wise_MAEs = dict()
    min_error, best_alpha = None, -1
    for alpha in alphas:
        forecast = simple_exponential_smoothing(data, alpha)
        MAE = calculate_MAE(data, forecast)
        alpha_wise_MAEs[alpha] = MAE
        if min_error == None or MAE < min_error:
            min_error = MAE
            best_alpha = alpha
    return alpha_wise_MAEs, best_alpha

def plot_alpha_wise_MAE(alpha_wise_MAEs):
    alphas = list(alpha_wise_MAEs.keys())
    MAEs = list(alpha_wise_MAEs.values())
    plt.figure(figsize=(8, 4))
    plt.plot(alphas, MAEs, 'b-o')
    plt.xlabel('Alpha Value')
    plt.ylabel('Mean Absolute Error (MAE)')
    plt.title('Alpha vs MAE Relationship')
    plt.grid(True, linestyle='--', alpha=0.7)
    min_mae = min(MAEs)
    best_alpha = alphas[MAEs.index(min_mae)]
    plt.plot(best_alpha, min_mae, 'ro')
    plt.text(best_alpha, min_mae, f'\nBest α = {best_alpha}\nMAE = {min_mae:,.0f}',
            verticalalignment='bottom')
    plt.tight_layout()
    plt.show()
In [8]:
alpha_vals = np.arange(0.1, 1.1, 0.1)
print(f"Alpha Values: {alpha_vals}")
# Split data into training (75%) and testing (25%)
train_data, test_data = train_test_split(viewers_data, split_size=0.75)
alpha_wise_MAEs, best_alpha = test_alphas(alpha_vals, train_data)
plot_alpha_wise_MAE(alpha_wise_MAEs)
Alpha Values: [0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1. ]

Plot Results -¶

In [9]:
viewers_data["sim_exp_smo_forecast"] = simple_exponential_smoothing(viewers_data['viewers'].values, best_alpha)
viewers_data["ses_error"] = viewers_data['viewers'] - viewers_data["sim_exp_smo_forecast"]
simple_exponential_smoothing_MAE = np.mean(abs(viewers_data['ses_error']))
print(f"Mean Absolute Error for Simple Exponential Smoothing: {simple_exponential_smoothing_MAE}")
plot_actual_vs_forecast(viewers_data['viewers'], viewers_data["sim_exp_smo_forecast"], viewers_data["ses_error"], viewers_data['show_finale'], 'Simple Exponential Smoothing')

viewers_data.head()
Mean Absolute Error for Simple Exponential Smoothing: 55530.79030054645
Out[9]:
date hour day_of_week viewers show_finale holiday finale_name sim_exp_smo_forecast ses_error
0 2024-11-01 0 5 177140 0 0 NaN 177140.0 0.0
1 2024-11-01 1 5 148073 0 0 NaN 177140.0 -29067.0
2 2024-11-01 2 5 115361 0 0 NaN 148073.0 -32712.0
3 2024-11-01 3 5 102277 0 0 NaN 115361.0 -13084.0
4 2024-11-01 4 5 103369 0 0 NaN 102277.0 1092.0

DOUBLE EXPONENTIAL SMOOTHING -¶

In [10]:
def double_exponential_smoothing(actual, params):
    alpha, beta = params
    L = [actual.iloc[0]]
    T = [actual.iloc[1] - actual.iloc[0]]
    L_T = [L[0] + T[0]]
    forecast = [actual.iloc[0]]
    for t in range(1, len(actual)):
        L_t = (alpha * actual.iloc[t-1]) + ((1 - alpha) * (L[t-1] + T[t-1]))
        T_t = (beta * (L_t - L[t-1])) + ((1 - beta) * (T[t-1]))
        F_t = (L_t + T_t)
        L.append(L_t)
        T.append(T_t)
        forecast.append(F_t)
    return forecast
In [11]:
def plot_parameter_combinations(results_df):
    fig = plt.figure(figsize=(15, 5))
    alpha_values = sorted(results_df['alpha'].unique())
    beta_values = sorted(results_df['beta'].unique())
    X, Y = np.meshgrid(alpha_values, beta_values)
    Z = results_df.pivot(index='beta', columns='alpha', values='MAE').values
    ax1 = fig.add_subplot(131, projection='3d')
    surf = ax1.plot_surface(X, Y, Z, cmap='viridis')
    ax1.set_xlabel('Alpha')
    ax1.set_ylabel('Beta')
    ax1.set_zlabel('MAE')
    ax1.set_title('3D Surface Plot')
    fig.colorbar(surf)
    ax2 = fig.add_subplot(132)
    contour = ax2.contour(X, Y, Z, levels=15)
    ax2.clabel(contour, inline=True, fontsize=8)
    ax2.set_xlabel('Alpha')
    ax2.set_ylabel('Beta')
    ax2.set_title('Contour Plot')
    ax3 = fig.add_subplot(133)
    heatmap = ax3.pcolor(X, Y, Z, cmap='viridis')
    ax3.set_xlabel('Alpha')
    ax3.set_ylabel('Beta')
    ax3.set_title('Heatmap')
    fig.colorbar(heatmap)
    best_alpha = results_df.loc[results_df['MAE'].idxmin(), 'alpha']
    best_beta = results_df.loc[results_df['MAE'].idxmin(), 'beta']
    ax2.plot(best_alpha, best_beta, 'r*', markersize=15, label=f'Best (α={best_alpha:.2f}, β={best_beta:.2f})')
    ax2.legend()
    ax3.plot(best_alpha, best_beta, 'r*', markersize=15)
    plt.tight_layout()
    plt.show()

Testing for Best Alpha & Beta Combination -¶

In [12]:
alphas = np.arange(0.1, 1.1, 0.1)
betas = np.arange(0.1, 1.1, 0.1)
results = []
for alpha, beta in product(alphas, betas):
    params = (alpha, beta)
    forecast = double_exponential_smoothing(viewers_data['viewers'], params)
    mae = np.mean(abs(viewers_data['viewers'] - forecast))
    results.append({'alpha': alpha, 'beta': beta, 'MAE': mae})
results_data = pd.DataFrame(results)
best_params = results_data.loc[results_data['MAE'].idxmin()]
print(f"Best parameters:        Alpha: {best_params['alpha']:.2f}       Beta: {best_params['beta']:.2f}")
print(f"MAE: {best_params['MAE']:.2f}")
plot_parameter_combinations(results_data)

params = (best_params['alpha'], best_params['beta'])
viewers_data["dou_exp_smo_forecast"] = double_exponential_smoothing(viewers_data['viewers'], params)
viewers_data["des_error"] = viewers_data['viewers'] - viewers_data["dou_exp_smo_forecast"]
double_exponential_smoothing_MAE = np.mean(abs(viewers_data['des_error']))
viewers_data.head()
Best parameters:        Alpha: 1.00       Beta: 0.10
MAE: 56471.67
Out[12]:
date hour day_of_week viewers show_finale holiday finale_name sim_exp_smo_forecast ses_error dou_exp_smo_forecast des_error
0 2024-11-01 0 5 177140 0 0 NaN 177140.0 0.0 177140.0000 0.0000
1 2024-11-01 1 5 148073 0 0 NaN 177140.0 -29067.0 150979.7000 -2906.7000
2 2024-11-01 2 5 115361 0 0 NaN 148073.0 -32712.0 121622.0300 -6261.0300
3 2024-11-01 3 5 102277 0 0 NaN 115361.0 -13084.0 88283.9270 13993.0730
4 2024-11-01 4 5 103369 0 0 NaN 102277.0 1092.0 76599.2343 26769.7657
In [13]:
print(f"Mean Absolute Error for Double Exponential Smoothing: {double_exponential_smoothing_MAE}")
plot_actual_vs_forecast(viewers_data['viewers'], viewers_data["dou_exp_smo_forecast"], viewers_data["des_error"], viewers_data['show_finale'], 'Double Exponential Smoothing')
Mean Absolute Error for Double Exponential Smoothing: 56471.6668618948

TRIPLE EXPONENTIAL SMOOTHING -¶

In [14]:
def triple_exponential_smoothing(actual, params):
    alpha, beta, gamma, seasonal_period = params
    L = [actual.iloc[0]]
    T = [(actual.iloc[1] - actual.iloc[0])]
    S = [actual.iloc[i] / (sum(actual.iloc[:seasonal_period]) / seasonal_period) for i in range(seasonal_period)]
    forecast = [actual.iloc[0]]
    for t in range(1, len(actual)):
        if t >= seasonal_period:
            # Once we have data for one complete season (a complete week)
            L_t = alpha * (actual.iloc[t-1] / S[t - seasonal_period]) + (1 - alpha) * (L[t-1] + T[t-1])
            T_t = beta * (L_t - L[t-1]) + (1 - beta) * T[t-1]
            S_t = gamma * (actual.iloc[t] / L_t) + (1 - gamma) * S[t - seasonal_period]
        else:
            # Initially when the model has not seen the seasonality effect
            L_t = alpha * actual.iloc[t-1] + (1 - alpha) * (L[t-1] + T[t-1])
            T_t = beta * (L_t - L[t-1]) + (1 - beta) * T[t-1]
            S_t = 1
        L.append(L_t)
        T.append(T_t)
        S.append(S_t)
        F_t = (L_t + T_t) * S_t
        forecast.append(F_t)

    return L, T, S, forecast
In [15]:
def plot_parameter_combinations(results_data):
    fig = plt.figure(figsize=(20, 5))
    for i, param in enumerate(['alpha', 'beta', 'gamma']):
        ax = fig.add_subplot(141 + i)
        param_mae = results_data.groupby(param)['MAE'].mean()
        ax.plot(param_mae.index, param_mae.values, ['b-o','g-o','r-o'][i])
        ax.set(xlabel=param.capitalize(), ylabel='Average MAE', title=f'{param.capitalize()} vs Average MAE')
        ax.plot(param_mae.idxmin(), param_mae.min(), 'r*', markersize=15)
    scatter = fig.add_subplot(144, projection='3d').scatter(results_data['alpha'], results_data['beta'], results_data['gamma'], c=results_data['MAE'], cmap='viridis')
    fig.axes[-1].set(xlabel='Alpha', ylabel='Beta', zlabel='Gamma', title='Parameter Space\n(color = MAE)')
    fig.colorbar(scatter)
    plt.tight_layout()
    plt.show()

alphas = np.arange(0.1, 1.1, 0.1)
betas = np.arange(0.1, 1.1, 0.1)
gammas = np.arange(0.1, 1.1, 0.1)
seasonal_period = 168
results = []
for alpha, beta, gamma in product(alphas, betas, gammas):
    params = (alpha, beta, gamma, seasonal_period)
    L, T, S, forecast = triple_exponential_smoothing(viewers_data['viewers'], params)
    mae = np.mean(abs(viewers_data['viewers'] - forecast))
    results.append({'alpha': alpha, 'beta': beta, 'gamma': gamma, 'MAE': mae})

Testing for Best Alpha, Beta, and Gamma Combination -¶

In [16]:
results_data = pd.DataFrame(results)
top_5_rows = results_data.nsmallest(5, 'MAE')
print("\n5 parameter combinations with lowest test MAE:")
print(top_5_rows.to_string(index=False))

best_params = results_data.loc[results_data['MAE'].idxmin()]
print(f"Best parameters:     Alpha: {best_params['alpha']:.2f}       Beta: {best_params['beta']:.2f}     Gamma: {best_params['gamma']:.2f}")
print(f"MAE: {best_params['MAE']:.2f}")
5 parameter combinations with lowest test MAE:
 alpha  beta  gamma          MAE
   0.1   0.1    1.0 23052.628386
   0.1   0.1    0.9 27412.707941
   1.0   0.1    0.9 30522.762082
   1.0   0.1    1.0 30540.394012
   1.0   0.1    0.8 31878.913159
Best parameters:     Alpha: 0.10       Beta: 0.10     Gamma: 1.00
MAE: 23052.63
In [17]:
plot_parameter_combinations(results_data)
In [18]:
split_point = int(len(viewers_data) * 0.75)
params = (best_params['alpha'], best_params['beta'], best_params['gamma'], seasonal_period)
L, T, S, viewers_data["tri_exp_smo_forecast"] = triple_exponential_smoothing(viewers_data['viewers'], params)
viewers_data["tes_error"] = viewers_data['viewers'] - viewers_data["tri_exp_smo_forecast"]
train_mae = np.mean(abs(viewers_data['tes_error'][:split_point]))
test_mae = np.mean(abs(viewers_data['tes_error'][split_point:]))
print(f"Training MAE (first 75%): {train_mae:.2f}")
print(f"Test MAE (last 25%): {test_mae:.2f}")
triple_exponential_smoothing_MAE = np.mean(abs(viewers_data['tes_error']))
print(f"Overall MAE: {triple_exponential_smoothing_MAE:.2f}")
viewers_data.head()
Training MAE (first 75%): 27534.28
Test MAE (last 25%): 9607.67
Overall MAE: 23052.63
Out[18]:
date hour day_of_week viewers show_finale holiday finale_name sim_exp_smo_forecast ses_error dou_exp_smo_forecast des_error tri_exp_smo_forecast tes_error
0 2024-11-01 0 5 177140 0 0 NaN 177140.0 0.0 177140.0000 0.0000 177140.000000 0.000000
1 2024-11-01 1 5 148073 0 0 NaN 177140.0 -29067.0 150979.7000 -2906.7000 122203.370000 25869.630000
2 2024-11-01 2 5 115361 0 0 NaN 148073.0 -32712.0 121622.0300 -6261.0300 96272.699300 19088.300700
3 2024-11-01 3 5 102277 0 0 NaN 115361.0 -13084.0 88283.9270 13993.0730 69854.778677 32422.221323
4 2024-11-01 4 5 103369 0 0 NaN 102277.0 1092.0 76599.2343 26769.7657 45094.472330 58274.527670

Final Prediction -¶

In [19]:
# Prediction utilizing 9pm (actual value)
future_forecast = None
L_t = alpha * viewers_data['viewers'].iloc[-4] + (1 - alpha) * (L[-1] + T[-1])
T_t = beta * (L_t - L[-1]) + (1 - beta) * T[-1]
S_t = 1
future_forecast = (L_t + T_t) * S_t
print(f"Viewership Forecast for Show Release: {future_forecast}")
Viewership Forecast for Show Release: 1166685.8254944792
In [20]:
plot_actual_vs_forecast(viewers_data['viewers'], viewers_data["tri_exp_smo_forecast"], viewers_data["tes_error"], viewers_data['show_finale'], 'Triple Exponential Smoothing')

EXPLORATORY DATA ANALYSIS -¶

Viewers by Day -¶

In [21]:
result = viewers_data.groupby('day_of_week')['viewers'].sum().reset_index()
day_counts = viewers_data['day_of_week'].value_counts().reset_index()
day_counts.columns = ['day_of_week', 'Number of Days']
res = result.merge(day_counts, on='day_of_week').sort_values('day_of_week')
# 24 rows for 24 hours. Thus divide by 24
res['Number of Days'] = res['Number of Days'] / 24
res['viewers'] = (res['viewers'].astype(float) / res['Number of Days']).apply(lambda x: f'{x:.2f}')
print(res)
   day_of_week     viewers  Number of Days
0            1  6617471.00             9.0
1            2  6882916.89             9.0
2            3  7027930.62             8.0
3            4  6743724.12             8.0
4            5  7435946.11             9.0
5            6  8940560.56             9.0
6            7  8594574.00             9.0
In [22]:
res['viewers'] = res['viewers'].astype(float)
plt.style.use('default')
plt.figure(figsize=(10, 6))
day_mapping = {1: 'Mon', 2: 'Tue', 3: 'Wed', 4: 'Thu', 5: 'Fri', 6: 'Sat', 7: 'Sun'}
res['day_name'] = res['day_of_week'].map(day_mapping)
ax = sns.barplot(data=res, x='day_name', y='viewers')
plt.title('Average Daily Viewers by Day of Week', pad=15)
plt.xlabel('Day of Week')
plt.ylabel('Average Number of Viewers')
plt.xticks(rotation=45)
plt.gca().yaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: format(int(x), ',')))
for i, v in enumerate(res['viewers']):
   ax.text(i, v, format(int(v), ','), ha='center', va='bottom')
plt.tight_layout()
plt.savefig('plots/viewers_by_day.png', dpi=300, bbox_inches='tight')
plt.show()

Viewers by Hour -¶

In [23]:
res = viewers_data.groupby('hour')['viewers'].mean().reset_index()
print(res)
    hour        viewers
0      0  206832.311475
1      1  156215.262295
2      2  131234.459016
3      3  106288.868852
4      4  106719.098361
5      5  131505.065574
6      6  203406.786885
7      7  255739.114754
8      8  305269.229508
9      9  255328.655738
10    10  230667.475410
11    11  255150.196721
12    12  309540.721311
13    13  358252.606557
14    14  330315.639344
15    15  308026.475410
16    16  361207.852459
17    17  407329.065574
18    18  471122.163934
19    19  524078.295082
20    20  576007.540984
21    21  608685.901639
22    22  520414.344262
23    23  362899.836066
In [24]:
res['viewers'] = res['viewers'].astype(float)
res['hour'] = res['hour'].astype(int)

hour_labels = ['12-1 AM', '1-2 AM', '2-3 AM', '3-4 AM', '4-5 AM', '5-6 AM', '6-7 AM', '7-8 AM', '8-9 AM', '9-10 AM', '10-11 AM', '11-12 PM', '12-1 PM', '1-2 PM', '2-3 PM', '3-4 PM', '4-5 PM', '5-6 PM', '6-7 PM', '7-8 PM', '8-9 PM', '9-10 PM', '10-11 PM', '11-12 AM']
plt.style.use('default')
plt.figure(figsize=(15, 6))
plt.bar(hour_labels, res['viewers'], color='skyblue', edgecolor='navy')
for i, v in enumerate(res['viewers']):
   plt.text(i, v * 1.01, f'{v:,.0f}', ha='center', va='bottom')
plt.title('Viewers by Hour', fontsize=14, pad=15)
plt.xlabel('Time of Day', fontsize=12)
plt.ylabel('Number of Viewers', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.grid(True, linestyle='--', alpha=0.3, axis='y')
plt.tight_layout()
plt.savefig('plots/viewers_by_hour.png', dpi=300, bbox_inches='tight')
plt.show()

Viewers by Holiday -¶

In [25]:
holiday_counts = viewers_data['holiday'].value_counts()
print(f"Number of non-holiday days (0): {holiday_counts[0]}")
print(f"Number of holiday days (1): {holiday_counts[1]}\n")
out = viewers_data.groupby('holiday')['viewers'].mean().reset_index()
print(out)
Number of non-holiday days (0): 1368
Number of holiday days (1): 96

   holiday        viewers
0        0  304174.430556
1        1  419852.437500
In [26]:
plt.style.use('default')
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
counts_data = viewers_data['holiday'].value_counts().reset_index()
counts_data.columns = ['holiday', 'count']
counts_data = counts_data.sort_values('holiday')
colors1 = ['#FF9999', '#66B2FF']
ax1.bar(['Non-Holiday', 'Holiday'], counts_data['count'], color=colors1, edgecolor='navy')
for i, v in enumerate(counts_data['count']):
   ax1.text(i, v + v*0.01, f'{v:,.0f}', ha='center', va='bottom')
ax1.set_title('Number of Holidays', fontsize=14, pad=15)
ax1.set_ylabel('Count of Days', fontsize=12)
ax1.grid(True, linestyle='--', alpha=0.3, axis='y')
mean_data = viewers_data.groupby('holiday')['viewers'].mean().reset_index()
mean_data = mean_data.sort_values('holiday')
colors2 = ['#99FF99', '#FFCC99']
ax2.bar(['Non-Holiday', 'Holiday'], mean_data['viewers'], color=colors2, edgecolor='navy')
for i, v in enumerate(mean_data['viewers']):
   ax2.text(i, v + v*0.01, f'{v:,.0f}', ha='center', va='bottom')
ax2.set_title('Mean Viewership by Holidays', fontsize=14, pad=15)
ax2.set_ylabel('Average Number of Viewers', fontsize=12)
ax2.grid(True, linestyle='--', alpha=0.3, axis='y')
plt.tight_layout()
plt.savefig('plots/holidays.png', dpi=300, bbox_inches='tight')
plt.show()

Viewers by Finale -¶

In [27]:
finale_counts = viewers_data['show_finale'].value_counts()
print(f"Number of non-finale days (0): {finale_counts[0]}")
print(f"Number of finale days (1): {finale_counts[1]}\n")
out2 = viewers_data.groupby('show_finale')['viewers'].mean().reset_index()
print(out2)
Number of non-finale days (0): 1461
Number of finale days (1): 3

   show_finale       viewers
0            0  3.097847e+05
1            1  1.273654e+06
In [28]:
plt.style.use('default')
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
counts_data = viewers_data['show_finale'].value_counts().reset_index()
counts_data.columns = ['show_finale', 'count']
counts_data = counts_data.sort_values('show_finale')
colors1 = ['#FF9999', '#66B2FF']

ax1.bar(['Non-Show-Finale', 'Show-Finale'], counts_data['count'], color=colors1, edgecolor='navy')
for i, v in enumerate(counts_data['count']):
   ax1.text(i, v + v*0.01, f'{v:,.0f}', ha='center', va='bottom')
ax1.set_title('Number of Days by Show-Finale', fontsize=14, pad=15)
ax1.set_ylabel('Count of Days', fontsize=12)
ax1.grid(True, linestyle='--', alpha=0.3, axis='y')
mean_data = viewers_data.groupby('show_finale')['viewers'].mean().reset_index()
mean_data = mean_data.sort_values('show_finale')
colors2 = ['#99FF99', '#FFCC99']
ax2.bar(['Non-Show-Finale', 'Show-Finale'], mean_data['viewers'], color=colors2, edgecolor='navy')
for i, v in enumerate(mean_data['viewers']):
   ax2.text(i, v + v*0.01, f'{v:,.0f}', ha='center', va='bottom')
ax2.set_title('Mean Viewership by Show-Finale', fontsize=14, pad=15)
ax2.set_ylabel('Average Number of Viewers', fontsize=12)
ax2.grid(True, linestyle='--', alpha=0.3, axis='y')
plt.tight_layout()
plt.savefig('plots/show_finale.png', dpi=300, bbox_inches='tight')
plt.show()

SARIMA SETUP -¶

In [29]:
def train_sarimax_model(df, exog_vars, train_split_size):
    train_size = int(len(df) * train_split_size)
    train_data = df.iloc[:train_size]
    test_data = df.iloc[train_size:]

    endog_train = df['viewers']
    exog_train = df[exog_vars] if exog_vars else None

    model = SARIMAX(
        endog_train,
        exog=exog_train,
        order=(1, 1, 1),
        seasonal_order=(1, 1, 1, 24),
        enforce_stationarity=False,
        enforce_invertibility=False
    )
    fitted_model = model.fit(disp=False)
    return fitted_model, train_data, test_data
In [30]:
def get_model_predictions(model, train_data, test_data, exog_vars):
    train_exog = train_data[exog_vars]
    test_exog = test_data[exog_vars]
    train_predicted = model.get_prediction(start=0, end=len(train_data)-1, exog=train_exog).predicted_mean
    test_predicted = model.get_forecast(steps=len(test_data), exog=test_exog).predicted_mean
    forecast = pd.concat([pd.Series(train_predicted), pd.Series(test_predicted)])
    full_data = pd.concat([train_data, test_data])
    train_mae = mean_absolute_error(train_data['viewers'], train_predicted)
    test_mae = mean_absolute_error(test_data['viewers'], test_predicted)
    overall_mae = mean_absolute_error(full_data['viewers'], forecast)
    return {'train_mae':train_mae,'test_mae':test_mae,'mae':overall_mae,'actual':full_data['viewers'],'forecast':forecast,
           'train_predicted':train_predicted,'test_predicted':test_predicted,'train':train_data['viewers'],
           'test':test_data['viewers']}

def plot_predictions(results):
    plt.figure(figsize=(15, 7))
    train_index = results['train'].index
    test_index = results['test'].index
    plt.plot(train_index, results['train'], label='Train Actual')
    plt.plot(train_index, results['train_predicted'], label='Train Predicted', linestyle='--', alpha=0.8)
    plt.plot(test_index, results['test'], label='Test Actual')
    plt.plot(test_index, results['test_predicted'], label='Test Predicted', linestyle='--', alpha=0.8)
    plt.title(f'SARIMAX Model: Actual vs Predicted (MAE: {results["mae"]:.2f})')
    plt.xlabel('Date')
    plt.ylabel('Number of Viewers')
    plt.legend()
    plt.show()

DEFINING EXOGENOUS VARIABLES & RUNNING SARIMA -¶

In [31]:
def prep_for_sarimax(df, exog_vars):
   if 'day_of_week' in df.columns and not any(col.startswith('dow_') for col in df.columns):
       dow_dummies = pd.get_dummies(df['day_of_week'], prefix='dow').astype(int)
       all_days = [f'dow_{i}' for i in range(1, 8)]
       for day in all_days:
           if day not in dow_dummies.columns:
               dow_dummies[day] = 0
       dow_dummies = dow_dummies[all_days]
       exog_vars.extend(all_days)
       df = pd.concat([df, dow_dummies], axis=1)
   return df, exog_vars

df, exog_vars = prep_for_sarimax(viewers_data, ['holiday', 'show_finale'])
print(f"Exogenous Variables: {exog_vars}")
df
Exogenous Variables: ['holiday', 'show_finale', 'dow_1', 'dow_2', 'dow_3', 'dow_4', 'dow_5', 'dow_6', 'dow_7']
Out[31]:
date hour day_of_week viewers show_finale holiday finale_name sim_exp_smo_forecast ses_error dou_exp_smo_forecast des_error tri_exp_smo_forecast tes_error dow_1 dow_2 dow_3 dow_4 dow_5 dow_6 dow_7
0 2024-11-01 0 5 177140 0 0 NaN 177140.0 0.0 177140.000000 0.000000 177140.000000 0.000000 0 0 0 0 1 0 0
1 2024-11-01 1 5 148073 0 0 NaN 177140.0 -29067.0 150979.700000 -2906.700000 122203.370000 25869.630000 0 0 0 0 1 0 0
2 2024-11-01 2 5 115361 0 0 NaN 148073.0 -32712.0 121622.030000 -6261.030000 96272.699300 19088.300700 0 0 0 0 1 0 0
3 2024-11-01 3 5 102277 0 0 NaN 115361.0 -13084.0 88283.927000 13993.073000 69854.778677 32422.221323 0 0 0 0 1 0 0
4 2024-11-01 4 5 103369 0 0 NaN 102277.0 1092.0 76599.234300 26769.765700 45094.472330 58274.527670 0 0 0 0 1 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1459 2024-12-31 19 2 588440 0 1 NaN 612805.0 -24365.0 637890.025919 -49450.025919 564783.145043 23656.854957 0 1 0 0 0 0 0
1460 2024-12-31 20 2 764253 0 1 NaN 588440.0 175813.0 608580.023327 155672.976673 730250.475080 34002.524920 0 1 0 0 0 0 0
1461 2024-12-31 21 2 734687 0 1 NaN 764253.0 -29566.0 799960.320995 -65273.320995 702204.046807 32482.953193 0 1 0 0 0 0 0
1462 2024-12-31 22 2 676607 0 1 NaN 734687.0 -58080.0 763866.988895 -87259.988895 658153.036321 18453.963679 0 1 0 0 0 0 0
1463 2024-12-31 23 2 420217 0 1 NaN 676607.0 -256390.0 697060.990006 -276843.990006 423446.594042 -3229.594042 0 1 0 0 0 0 0

1464 rows × 20 columns

BASELINE SARIMAX - Single Test Train Split¶

In [32]:
model, train_data, test_data = train_sarimax_model(df, exog_vars, train_split_size=0.75)
In [33]:
results = get_model_predictions(model, train_data, test_data, exog_vars)
plot_predictions(results)

print(f"Train MAE: {results['train_mae']}")
print(f"Test MAE: {results['test_mae']}")
print(f"Overall MAE: {results['mae']}")
Train MAE: 25023.015095586925
Test MAE: 215398.15686336672
Overall MAE: 72616.80053753188
In [34]:
def make_future_predictions(model, df_to_pred, exog_vars):
    exog_test = df_to_pred[exog_vars]
    forecast = model.get_forecast(steps=len(df_to_pred), exog=exog_test)
    predicted_values = forecast.predicted_mean
    confidence_intervals = forecast.conf_int()
    predicted_df = df_to_pred.copy()
    predicted_df['viewers'] = predicted_values
    return predicted_df

df_to_pred, exog_vars = prep_for_sarimax(data_without_viewers, ['holiday', 'show_finale'])
print(f"Exogenous Variables: {exog_vars}")
predicted_df = make_future_predictions(model, df_to_pred, exog_vars)
predicted_df
Exogenous Variables: ['holiday', 'show_finale', 'dow_1', 'dow_2', 'dow_3', 'dow_4', 'dow_5', 'dow_6', 'dow_7']
Out[34]:
date hour day_of_week viewers show_finale holiday finale_name dow_1 dow_2 dow_3 dow_4 dow_5 dow_6 dow_7
1464 2025-01-01 0 3 298791.954794 0 1 NaN 0 0 1 0 0 0 0
1465 2025-01-01 1 3 252381.876048 0 1 NaN 0 0 1 0 0 0 0
1466 2025-01-01 2 3 223855.235285 0 1 NaN 0 0 1 0 0 0 0
1467 2025-01-01 3 3 199042.483767 0 1 NaN 0 0 1 0 0 0 0
1468 2025-01-01 4 3 203293.737002 0 1 NaN 0 0 1 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1579 2025-01-05 19 7 565183.280929 0 0 NaN 0 0 0 0 0 0 1
1580 2025-01-05 20 7 644691.975336 0 0 NaN 0 0 0 0 0 0 1
1581 2025-01-05 21 7 634358.577117 0 0 NaN 0 0 0 0 0 0 1
1582 2025-01-05 22 7 584218.933195 0 0 NaN 0 0 0 0 0 0 1
1583 2025-01-05 23 7 399675.507313 0 0 NaN 0 0 0 0 0 0 1

120 rows × 14 columns

SAVING RESULTS IN EXCEL -¶

In [35]:
def combine_dataframes(df, df_to_pred):
    dow_cols = [col for col in df.columns if col.startswith('dow_')]
    df_clean = df.drop(columns=dow_cols, errors='ignore')
    df_to_pred_clean = df_to_pred.drop(columns=dow_cols, errors='ignore')
    final_df = pd.concat([df_clean, df_to_pred_clean], axis=0)
    return final_df

final_df = combine_dataframes(df, predicted_df)
# Exporting predictions using SARIMA
final_df.to_excel('baseline_SARIMA_simple_train_test_split_preds.xlsx', index=False)

SARIMAX - BOOTSTRAPPING¶

In [36]:
def get_train_test_partition(df, train_split_size, offset=0):
    n_samples = len(df)
    train_size = int(n_samples * train_split_size)
    start_idx = offset % (n_samples - train_size)
    return df.iloc[start_idx:start_idx+train_size], df.iloc[start_idx+train_size:min(start_idx+train_size+24, n_samples)]

def fit_model_with_method(train_data, exog_vars, order, method, options, best_params=None):
    model = SARIMAX(train_data['viewers'], exog=train_data[exog_vars] if exog_vars else None,
                    order=order, seasonal_order=(1, 1, 1, 24))
    if best_params is None:
        fitted_model = model.fit(disp=False, method=method, **options)
    else:
        fitted_model = model.fit(start_params=best_params, disp=False, method=method, **options)
    return fitted_model

def train_single_iteration(df, train_split_size, offset, exog_vars, order, methods, best_params):
    train_data, test_data = get_train_test_partition(df, train_split_size, offset)
    iteration_results, best_iter_mae, best_iter_params, best_iter_method = {}, float('inf'), best_params, None
    for method, options in methods.items():
        try:
            fitted_model = fit_model_with_method(train_data, exog_vars, order, method, options, best_params)
            predictions = get_model_predictions(fitted_model, train_data, test_data, exog_vars)
            mae = predictions['mae']
            iteration_results[f'{method}_mae'] = mae
            if mae < best_iter_mae:
                best_iter_mae, best_iter_params, best_iter_method = mae, fitted_model.params, method
        except Exception as e:
            print(f"Error with {method}: {str(e)}")
            iteration_results[f'{method}_mae'] = None
    return iteration_results, best_iter_mae, best_iter_params, best_iter_method

def make_future_predictions(model, future_df, exog_vars):
    future_exog = future_df[exog_vars] if exog_vars else None
    forecast = model.get_forecast(steps=len(future_df), exog=future_exog)
    return pd.DataFrame({'date': future_df.index,'predicted_viewers': forecast.predicted_mean,'conf_int_lower':
                         forecast.conf_int()['lower viewers'],'conf_int_upper': forecast.conf_int()['upper viewers']})

def plot_training_history(training_history):
    plt.figure(figsize=(12, 6))
    for method in ['lbfgs', 'powell', 'nm', 'cg']:
        plt.plot(training_history['iteration'], training_history[f'{method}_mae'], label=f'{method} MAE')
    plt.title('Model Performance Over Iterations')
    plt.xlabel('Iteration')
    plt.ylabel('MAE')
    plt.legend()
    plt.show()
In [39]:
def iterative_train_improve_model(df, exog_vars, iterations, order, convergence_iters, train_split_size=0.75):
   warnings.filterwarnings('ignore', category=ConvergenceWarning)
   evaluation_results = []
   best_mae = float('inf')
   best_params = None
   best_method = None
   methods = {'lbfgs': {'maxiter': convergence_iters}, 'powell': {'maxiter': convergence_iters},
              'nm': {'maxiter': convergence_iters}, 'cg': {'maxiter': convergence_iters}}
   for i in range(iterations):
       print(f"\nIteration {i+1}/{iterations}")
       print("-" * 40)
       offset = i * 24
       iter_results, iter_mae, iter_params, iter_method = train_single_iteration(df, train_split_size,
                                                                               offset, exog_vars, order, methods, best_params)
       iter_results['iteration'] = i
       evaluation_results.append(iter_results)
       if iter_mae < best_mae:
           best_mae, best_params, best_method = iter_mae, iter_params, iter_method
       print(f"Current Best Method: {best_method}")
       print(f"Current Best MAE: {best_mae:.2f}")
       print(f"All Methods MAE:")
       for method in methods:
        mae = iter_results.get(f'{method}_mae')
        if mae is not None:
            print(f"  {method:6}: {mae:.2f}")
        else:
            print(f"  {method:6}: Failed")
   final_model = SARIMAX(df['viewers'], exog=df[exog_vars] if exog_vars else None,
                        order=order, seasonal_order=(1, 1, 1, 24))
   with warnings.catch_warnings():  # Add warning filter for final fit
       warnings.simplefilter('ignore')
       final_fitted = final_model.fit(start_params=best_params, disp=False, method=best_method)
   return {'final_model': final_fitted,'training_history': pd.DataFrame(evaluation_results),'best_method': best_method}
In [40]:
n_iterations = 5
results = iterative_train_improve_model(df=df,exog_vars=exog_vars,iterations=n_iterations,order=(1,1,1),convergence_iters=100)
plot_training_history(results['training_history'])
final_predictions = make_future_predictions(results['final_model'], df_to_pred, exog_vars)
Iteration 1/5
----------------------------------------
Current Best Method: powell
Current Best MAE: 23066.08
All Methods MAE:
  lbfgs : 23198.22
  powell: 23066.08
  nm    : 24855.28
  cg    : 536311237734.27

Iteration 2/5
----------------------------------------
Current Best Method: lbfgs
Current Best MAE: 22603.62
All Methods MAE:
  lbfgs : 22603.62
  powell: 22610.85
  nm    : 22666.99
  cg    : 55977.23

Iteration 3/5
----------------------------------------
Current Best Method: powell
Current Best MAE: 21331.39
All Methods MAE:
  lbfgs : 22926.21
  powell: 21331.39
  nm    : 23031.28
  cg    : 31925.56

Iteration 4/5
----------------------------------------
Current Best Method: powell
Current Best MAE: 21242.85
All Methods MAE:
  lbfgs : 21263.10
  powell: 21242.85
  nm    : 21265.11
  cg    : 262438.95

Iteration 5/5
----------------------------------------
Current Best Method: powell
Current Best MAE: 21242.85
All Methods MAE:
  lbfgs : 22410.36
  powell: 22399.42
  nm    : 22521.87
  cg    : 57743.11
In [41]:
final_predictions
Out[41]:
date predicted_viewers conf_int_lower conf_int_upper
1464 1464 334293.523330 277984.835146 390602.211515
1465 1465 292050.242970 229931.081083 354169.404857
1466 1466 267921.090414 202187.499416 333654.681411
1467 1467 243783.591105 174852.276391 312714.905819
1468 1468 244395.769128 172442.374439 316349.163817
... ... ... ... ...
1579 1579 555275.475453 314876.675291 795674.275615
1580 1580 606395.991193 364949.873253 847842.109132
1581 1581 601332.277736 358843.270006 843821.285465
1582 1582 551390.190891 307862.172432 794918.209350
1583 1583 394676.292746 150110.534120 639242.051373

120 rows × 4 columns

In [42]:
def combine_dataframes(df, df_to_pred):
    dow_cols = [col for col in df.columns if col.startswith('dow_')]
    df_clean = df.drop(columns=dow_cols, errors='ignore')
    df_to_pred_clean = df_to_pred.drop(columns=dow_cols, errors='ignore')
    final_df = pd.concat([df_clean, df_to_pred_clean], axis=0)
    return final_df

final_df = combine_dataframes(df, final_predictions)
# Exporting predictions using SARIMA
final_df.to_excel('SARIMA_bootstrapping_preds.xlsx', index=False)