In [1]:
# imports
from google.colab import drive
drive.mount('/content/drive')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
In [2]:
# create pandas dataframe
df = pd.read_csv("/content/drive/MyDrive/songs_normalize.csv")
df.head()
Out[2]:
artist song duration_ms explicit year popularity danceability energy key loudness mode speechiness acousticness instrumentalness liveness valence tempo genre
0 Britney Spears Oops!...I Did It Again 211160 False 2000 77 0.751 0.834 1 -5.444 0 0.0437 0.3000 0.000018 0.3550 0.894 95.053 pop
1 blink-182 All The Small Things 167066 False 1999 79 0.434 0.897 0 -4.918 1 0.0488 0.0103 0.000000 0.6120 0.684 148.726 rock, pop
2 Faith Hill Breathe 250546 False 1999 66 0.529 0.496 7 -9.007 1 0.0290 0.1730 0.000000 0.2510 0.278 136.859 pop, country
3 Bon Jovi It's My Life 224493 False 2000 78 0.551 0.913 0 -4.063 0 0.0466 0.0263 0.000013 0.3470 0.544 119.992 rock, metal
4 *NSYNC Bye Bye Bye 200560 False 2000 65 0.614 0.928 8 -4.806 0 0.0516 0.0408 0.001040 0.0845 0.879 172.656 pop

Code for Data Cleanup:

In [3]:
# Count of duplicates
print("Dropped", len(df[df.duplicated(keep='first')]), "duplicate rows!")
# Drop duplicates:
df.drop_duplicates(inplace=True)

# Data is complete
# print(df.isnull().sum())
Dropped 59 duplicate rows!
In [4]:
print("Dropped", len(df[df['genre'].isin(['set()'])]), "rows with genre 'set()'.")
# Drop rows with genre = 'set()'
df = df[~df['genre'].isin(['set()'])]

# Reset the index if needed
df = df.reset_index(drop=True)
Dropped 22 rows with genre 'set()'.
In [5]:
# Split each genre string into a list of genres
df['genre'] = df['genre'].str.split(', ')
# Find unique genres (flatten the list of lists and convert to a set)
unique_genres = list(set(genre for genres in df['genre'] for genre in genres))

print("Unique Genres:", unique_genres)
print("Number of Unique Genres:", len(unique_genres))
Unique Genres: ['hip hop', 'easy listening', 'country', 'classical', 'metal', 'rock', 'blues', 'R&B', 'latin', 'jazz', 'World/Traditional', 'Dance/Electronic', 'Folk/Acoustic', 'pop']
Number of Unique Genres: 14
In [6]:
# # Map every genre to a number
# genre_mapping = {genre: idx for idx, genre in enumerate(unique_genres)}
# print(genre_mapping)
In [7]:
# # Replace each genre with the genre mapping assigned
# df['genre'] = df['genre'].apply(lambda genres: sorted([genre_mapping[genre] for genre in genres]))
# Replace the boolean explicit flag with a numerical value (True -> 1, False -> 0)
df['explicit'] = df['explicit'].astype(int)
In [8]:
# Flatten the 'genre' column and count songs per genre, sorted by genre name
df_genre_column = df.explode('genre')
genre_counts = df_genre_column['genre'].value_counts().sort_index().to_dict()

print(genre_counts)

# Create a histogram
plt.bar(genre_counts.keys(), genre_counts.values())

# Label the axes and add a title
plt.xlabel('Genres')
plt.ylabel('Song Count')
plt.title('Histogram of the number of songs of each genre:')

# Adjust the appearance of x-axis labels
plt.xticks(rotation=45, ha='right') # Rotate labels by 45 degrees

# Show the histogram
plt.show()
{'Dance/Electronic': 380, 'Folk/Acoustic': 20, 'R&B': 437, 'World/Traditional': 10, 'blues': 4, 'classical': 1, 'country': 20, 'easy listening': 7, 'hip hop': 751, 'jazz': 2, 'latin': 63, 'metal': 64, 'pop': 1582, 'rock': 225}
[Image: histogram of song counts per genre]

We can drop the songs that belong solely to the genres Folk/Acoustic, World/Traditional, blues, classical, country, easy listening, and jazz, because, as shown above, we have very little data for them (song count <= 20). With so few examples, a model would either treat these songs as outliers or predict them poorly. Therefore, we will work with the remaining 7 genres; the drop list can also be derived programmatically, as sketched below.
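
A minimal sketch deriving the drop list from the counts computed above, instead of hardcoding it (assumes the genre_counts dict from the previous cell is still in scope):

# Derive the genres to drop from the counts printed above (sketch).
genres_to_drop = sorted(g for g, c in genre_counts.items() if c <= 20)
print(genres_to_drop)
# ['Folk/Acoustic', 'World/Traditional', 'blues', 'classical', 'country', 'easy listening', 'jazz']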

In [9]:
# List of the genres to drop
genres_to_drop = ['Folk/Acoustic','World/Traditional', 'blues', 'classical', 'country', 'easy listening', 'jazz']

# Remove the dropped genres from each song's genre list
df['genre'] = df['genre'].apply(lambda genres: [g for g in genres if g not in genres_to_drop])

# Filter out songs with empty genre lists
df = df[df['genre'].apply(len) > 0]

# Reset the index if needed
df = df.reset_index(drop=True)

# Flatten the 'genre' column and count songs per genre, sorted by genre name
df_genre_column = df.explode('genre')
genre_counts = df_genre_column['genre'].value_counts().sort_index().to_dict()
unique_genres = list(genre_counts.keys())
print(genre_counts)

# Create a histogram
plt.bar(genre_counts.keys(), genre_counts.values())

# Label the axes and add a title
plt.xlabel('Genres')
plt.ylabel('Song Count')
plt.title('Updated Histogram (song count > 20) :')

# Adjust the appearance of x-axis labels
plt.xticks(rotation=45, ha='right') # Rotate labels by 45 degrees

# Show the histogram
plt.show()
{'Dance/Electronic': 380, 'R&B': 437, 'hip hop': 751, 'latin': 63, 'metal': 64, 'pop': 1582, 'rock': 225}
[Image: updated histogram of song counts per genre (song count > 20)]

We can drop the artist, song, duration_ms, year, and popularity columns (the genre column is kept for now as the class label and removed later, before the unsupervised models):

artist, song: Neither column describes the audio itself or its sound characteristics, so they are not needed for genre classification.

duration_ms: The duration of a song gives no significant insight for genre classification.

year, popularity: The year of release and the popularity ranking may carry information about historical music trends and genre popularity, but they are not needed for genre classification.

In [10]:
try:
  df = df.drop(['artist','song','duration_ms','year','popularity'], axis=1)
except KeyError:
  # Columns are already gone if this cell is re-run
  pass
# We need to remove the genre column later to apply our unsupervised learning models.
df.head()
Out[10]:
explicit danceability energy key loudness mode speechiness acousticness instrumentalness liveness valence tempo genre
0 0 0.751 0.834 1 -5.444 0 0.0437 0.3000 0.000018 0.3550 0.894 95.053 [pop]
1 0 0.434 0.897 0 -4.918 1 0.0488 0.0103 0.000000 0.6120 0.684 148.726 [rock, pop]
2 0 0.529 0.496 7 -9.007 1 0.0290 0.1730 0.000000 0.2510 0.278 136.859 [pop]
3 0 0.551 0.913 0 -4.063 0 0.0466 0.0263 0.000013 0.3470 0.544 119.992 [rock, metal]
4 0 0.614 0.928 8 -4.806 0 0.0516 0.0408 0.001040 0.0845 0.879 172.656 [pop]

We will now normalize all the entries in the dataframe using MinMaxScaler, which transforms each column to values between -1 and 1 based on that column's min and max values.
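
For a feature x, MinMaxScaler with feature_range=(-1, 1) computes x' = -1 + 2 * (x - min) / (max - min). A quick sketch verifying this by hand on one column:

# Sanity check (sketch): hand-computed min-max scaling to [-1, 1] on one column.
x = df['tempo']
x_scaled = -1 + 2 * (x - x.min()) / (x.max() - x.min())
print(x_scaled.head())  # should match the 'tempo' column produced by the scaler below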

In [11]:
from sklearn.preprocessing import MinMaxScaler

# Initializing the MinMaxScaler
scaler = MinMaxScaler(feature_range=(-1, 1))
# We have to scale every column except the last one
df_to_scale = df.iloc[:,:-1]
# Normalizing all the entries in the DataFrame
df_normalized = pd.DataFrame(scaler.fit_transform(df_to_scale), columns=df_to_scale.columns)
df_normalized['genre'] = df['genre']

df_normalized.head()
Out[11]:
explicit danceability energy key loudness mode speechiness acousticness instrumentalness liveness valence tempo genre
0 -1.0 0.470449 0.650461 -0.818182 0.489278 -1.0 -0.925832 -0.385273 -0.999964 -0.197835 0.830998 -0.535457 [pop]
1 -1.0 -0.278960 0.783921 -1.000000 0.541259 1.0 -0.907381 -0.978932 -1.000000 0.420325 0.381752 0.176236 [rock, pop]
2 -1.0 -0.054374 -0.065565 0.272727 0.137168 1.0 -0.979016 -0.645524 -1.000000 -0.447986 -0.486790 0.018882 [pop]
3 -1.0 -0.002364 0.817816 -1.000000 0.625754 -1.0 -0.915340 -0.946145 -0.999973 -0.217078 0.082255 -0.204771 [rock, metal]
4 -1.0 0.146572 0.849592 0.454545 0.552327 -1.0 -0.897250 -0.916431 -0.997888 -0.848467 0.798909 0.493542 [pop]
In [12]:
# Explode the 'genre' column to create a row for each genre
expanded_df = df_normalized.explode('genre')
# Calculate the mean values for each genre
df_genre_means = pd.pivot_table(expanded_df, index='genre', values=list(expanded_df.columns), aggfunc='mean').reset_index()
print("Mean values of each feature for every genre in the dataframe.\n".center(150, " "))
df_genre_means
                                            Mean values of each feature for every genre in the dataframe.
                                            
Out[12]:
genre acousticness danceability energy explicit instrumentalness key liveness loudness mode speechiness tempo valence
0 Dance/Electronic -0.778955 0.273373 0.511615 -0.768421 -0.922069 0.042105 -0.607016 0.518938 0.015789 -0.799113 -0.151350 0.069485
1 R&B -0.690498 0.325500 0.286296 -0.432494 -0.986500 -0.032245 -0.657804 0.440895 0.038902 -0.668797 -0.261606 0.125626
2 hip hop -0.761480 0.393342 0.366361 0.105193 -0.987489 0.017068 -0.597393 0.472718 0.083888 -0.534101 -0.230440 0.121160
3 latin -0.688925 0.409134 0.555165 -0.809524 -0.998660 -0.004329 -0.582170 0.564235 0.174603 -0.739887 -0.225323 0.398119
4 metal -0.916893 -0.103354 0.644470 -0.718750 -0.989436 -0.068182 -0.594284 0.547491 0.250000 -0.912898 -0.135625 -0.129777
5 pop -0.730154 0.281021 0.400245 -0.496839 -0.976530 -0.027238 -0.618143 0.481590 0.096081 -0.719598 -0.207708 0.105280
6 rock -0.848472 0.009309 0.558512 -0.822222 -0.938355 -0.049697 -0.581506 0.518222 0.262222 -0.868746 -0.123047 0.045036

Many songs have a list of genres, but our unsupervised learning models will assign each song to a single genre, so we have to provide one genre label per song. We will do this using the mean values calculated above for each genre: by taking the Euclidean distance between a song's audio-feature values and each candidate genre's means, we choose the genre that best represents the song and remove all other genres from its genre list.
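
The next cell does this row by row with iterrows. For reference, the same assignment can be written in vectorized form with NumPy broadcasting (a sketch, not the cell actually run below):

# Sketch: distance from every song to every genre mean in one broadcasted step.
feature_cols = df_normalized.columns[:-1]
M = df_genre_means.set_index('genre')[feature_cols]     # (n_genres, n_features)
S = df_normalized[feature_cols].to_numpy()              # (n_songs, n_features)
D = np.linalg.norm(S[:, None, :] - M.to_numpy()[None, :, :], axis=2)
# For each song, keep the nearest genre among the genres in its own list:
nearest = [min(genres, key=lambda g: D[i, M.index.get_loc(g)])
           for i, genres in enumerate(df_normalized['genre'])]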

In [13]:
# Loop through the DataFrame and change the 'genre' column
for index, row in df_normalized.iterrows():
  dists = []
  for genre in row['genre']:
    genre_features = df_genre_means[df_genre_means['genre'] == genre].iloc[:,1:]
    song_features = df_normalized.iloc[index,:-1]
    # Create numpy arrays of feature values to find euclidean distance
    genre_values = genre_features[df_normalized.iloc[:,:-1].columns].values[0]
    song_values = song_features[df_normalized.iloc[:,:-1].columns].values
    # Calculate the Euclidean distance
    euclidean_dist = np.linalg.norm(song_values - genre_values)
    # Append to the dists array
    dists.append((genre, euclidean_dist))
  if index < 5:
    print("Array of Euclidean Distances: ", dists)
    print("Selected Genre: ", min(dists, key=lambda x: x[1])[0])
  # Get the genre with the minimum distance and update the 'genre' column of the dataframe
  df_normalized.at[index, 'genre'] = min(dists, key=lambda x: x[1])[0]
df_normalized.head()
Array of Euclidean Distances:  [('pop', 1.7752394429909757)]
Selected Genre:  pop
Array of Euclidean Distances:  [('rock', 1.6859303351942319), ('pop', 1.9699797398924441)]
Selected Genre:  rock
Array of Euclidean Distances:  [('pop', 1.4540178813055917)]
Selected Genre:  pop
Array of Euclidean Distances:  [('rock', 1.6622885702272139), ('metal', 1.6580877882930478)]
Selected Genre:  metal
Array of Euclidean Distances:  [('pop', 1.7332948990314008)]
Selected Genre:  pop
Out[13]:
explicit danceability energy key loudness mode speechiness acousticness instrumentalness liveness valence tempo genre
0 -1.0 0.470449 0.650461 -0.818182 0.489278 -1.0 -0.925832 -0.385273 -0.999964 -0.197835 0.830998 -0.535457 pop
1 -1.0 -0.278960 0.783921 -1.000000 0.541259 1.0 -0.907381 -0.978932 -1.000000 0.420325 0.381752 0.176236 rock
2 -1.0 -0.054374 -0.065565 0.272727 0.137168 1.0 -0.979016 -0.645524 -1.000000 -0.447986 -0.486790 0.018882 pop
3 -1.0 -0.002364 0.817816 -1.000000 0.625754 -1.0 -0.915340 -0.946145 -0.999973 -0.217078 0.082255 -0.204771 metal
4 -1.0 0.146572 0.849592 0.454545 0.552327 -1.0 -0.897250 -0.916431 -0.997888 -0.848467 0.798909 0.493542 pop

We need to reduce the number of classes for our unsupervised learning models, so we will use the Euclidean distances between genre feature means to find similar genres.
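
For reference, scipy can produce the full pairwise distance matrix in one call (a sketch alongside the filtered loop below):

from scipy.spatial.distance import pdist, squareform

# Sketch: full matrix of Euclidean distances between genre feature means.
genre_vectors = df_genre_means.set_index('genre')
dist_matrix = pd.DataFrame(squareform(pdist(genre_vectors)),
                           index=genre_vectors.index, columns=genre_vectors.index)
print(dist_matrix.round(3))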

In [14]:
from itertools import combinations

# Calculate distances between all pairs of rows
distances = []
for pair in combinations(df_genre_means.iterrows(), 2):
    row1, row2 = pair[0][1], pair[1][1]
    genre1_data = row1[df_genre_means.iloc[:,1:].columns].values
    genre2_data = row2[df_genre_means.iloc[:,1:].columns].values
    dist =  np.linalg.norm(genre1_data - genre2_data)
    if dist < 0.35:
      distances.append((row1['genre'], row2['genre'], dist))

# Create a DataFrame to store the distances
distance_df = pd.DataFrame(distances, columns=['Genre1', 'Genre2', 'Euclidean Distance'])
distance_df
Out[14]:
Genre1 Genre2 Euclidean Distance
0 Dance/Electronic pop 0.339293
1 R&B pop 0.182204
2 metal rock 0.268807

The genres above have similar mean values, but it is not clear that Euclidean distance is a good criterion for merging labels: with this many features the data is high-dimensional, and Euclidean distances become less informative as dimensionality grows.

In [15]:
# # Code to find similarity of features for every genre based on feature means

# # Create a dictionary to map genres to colors
# genre_colors = {'Dance/Electronic': 'red', 'Folk/Acoustic': 'blue', 'R&B': 'green', 'country':'black','hip hop':'yellow',\
#                 'latin':'grey','metal':'violet','pop':'indigo','rock':'orange'}

# # Plot the points with different colors and labels
# for label, color in genre_colors.items():
#     subset = df_genre_means[df_genre_means['genre'] == label]
#     plt.scatter(subset['acousticness'], subset['explicit'], c=color, label=label)

# # Add labels and legend
# plt.xlabel('acousticness')
# plt.ylabel('explicit')
# plt.legend()
# # Show the plot
# plt.show()
In [16]:
corr_mat = df_normalized.iloc[:,:-1].corr()
plt.subplots(figsize=(12,9))
plt.title('\nCorrelation between Features:\n', fontsize = 20)
sns.heatmap(corr_mat, cmap='coolwarm', annot=True)
plt.show()
[Image: heatmap of correlations between features]
In [17]:
# Code to print out highly correlated features
thresholds = [0.8, 0.6, 0.4]  # Adjust the threshold as needed

# Took the upper triangular part of the correlation matrix
upper_triangular = np.triu(corr_mat, k=1)

# Highly correlated feature pairs
for threshold in thresholds:
  high_corr_pairs = [(corr_mat.columns[i], corr_mat.columns[j]) for i in range(upper_triangular.shape[0]) \
                     for j in range(i+1, upper_triangular.shape[1]) if abs(upper_triangular[i, j]) > threshold]

  print(threshold, high_corr_pairs)
0.8 []
0.6 [('energy', 'loudness')]
0.4 [('explicit', 'speechiness'), ('danceability', 'valence'), ('energy', 'loudness'), ('energy', 'acousticness')]

The correlations between features do not appear to be high, so we will move forward with all the features.

We will now split the data into a training set (80%) and a test set (20%). The genre column provides the class labels for the songs.

In [18]:
from sklearn.model_selection import train_test_split

X = df_normalized.iloc[:,:-1]
y = df_normalized.iloc[:,-1:]

xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)
cols = xtrain.columns

# Scale with the MinMaxScaler, fitting on the training set only to avoid test-set leakage
xtrain = scaler.fit_transform(xtrain)
xtest = scaler.transform(xtest)

xtrain = pd.DataFrame(xtrain, columns = cols)
xtest = pd.DataFrame(xtest, columns = cols)

xtrain.describe()
Out[18]:
explicit danceability energy key loudness mode speechiness acousticness instrumentalness liveness valence tempo
count 1526.000000 1526.000000 1526.000000 1526.000000 1526.000000 1526.000000 1526.000000 1526.000000 1526.000000 1526.000000 1526.000000 1526.000000
mean -0.433814 0.264940 0.408104 -0.020732 0.515861 0.106160 -0.711363 -0.739947 -0.969047 -0.612605 0.087629 -0.197375
std 0.901298 0.335521 0.325470 0.654970 0.199157 0.994675 0.345957 0.349806 0.180313 0.340728 0.475237 0.360126
min -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000
25% -1.000000 0.056738 0.199767 -0.636364 0.415976 -1.000000 -0.940937 -0.970531 -1.000000 -0.836621 -0.262167 -0.482968
50% -1.000000 0.289598 0.442856 0.090909 0.537676 1.000000 -0.865051 -0.887742 -1.000000 -0.746242 0.103647 -0.204201
75% 1.000000 0.491726 0.658405 0.454545 0.653560 1.000000 -0.624457 -0.639377 -0.999851 -0.472038 0.466788 0.008105
max 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000
In [19]:
xtest.describe()
Out[19]:
explicit danceability energy key loudness mode speechiness acousticness instrumentalness liveness valence tempo
count 382.000000 382.000000 382.000000 382.000000 382.000000 382.000000 382.000000 382.000000 382.000000 382.000000 382.000000 382.000000
mean -0.460733 0.307626 0.426807 -0.021894 0.525660 0.089005 -0.682319 -0.740411 -0.973066 -0.617001 0.143655 -0.228909
std 0.888703 0.318667 0.313753 0.663250 0.182761 0.997337 0.362766 0.352723 0.155671 0.339539 0.460260 0.345276
min -1.000000 -0.879433 -0.563394 -1.000000 -0.233134 -1.000000 -0.997467 -0.999997 -1.000000 -0.988455 -0.865440 -0.962117
25% -1.000000 0.120567 0.220422 -0.636364 0.434131 -1.000000 -0.930807 -0.975500 -1.000000 -0.844378 -0.203872 -0.521169
50% -1.000000 0.321513 0.460862 -0.090909 0.550319 1.000000 -0.846599 -0.879443 -1.000000 -0.753458 0.151781 -0.204638
75% 1.000000 0.538416 0.673763 0.454545 0.647492 1.000000 -0.529486 -0.670627 -0.999899 -0.467829 0.520270 -0.071596
max 1.000000 0.985816 0.957632 1.000000 1.046930 1.000000 0.667149 0.952868 0.524873 0.874925 0.997861 0.881789

The LabelEncoder converts the genre labels into integer codes, which the models can work with efficiently.
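
A minimal illustration of the mapping LabelEncoder builds (a sketch with toy labels):

from sklearn.preprocessing import LabelEncoder

# Toy example (sketch): classes are sorted alphabetically and mapped to 0..n-1.
le_demo = LabelEncoder().fit(['pop', 'rock', 'metal', 'pop'])
print(le_demo.classes_)                      # ['metal' 'pop' 'rock']
print(le_demo.transform(['pop', 'metal']))   # [1 0]
print(le_demo.inverse_transform([2]))        # ['rock']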

In [20]:
from sklearn.preprocessing import LabelEncoder

LE = LabelEncoder()

ytrain = LE.fit_transform(ytrain.values.ravel())
ytest = LE.transform(ytest.values.ravel())

X = pd.concat([xtrain, xtest], axis=0)
y = pd.concat([pd.DataFrame(ytrain), pd.DataFrame(ytest)], axis=0)

y_train = LE.inverse_transform(ytrain)
y_test = LE.inverse_transform(ytest)

y_original = pd.concat([pd.DataFrame(y_train), pd.DataFrame(y_test)], axis=0)

Now we will be running PCA on the dataframe in order to reduce the number of dimensions and remove highly correlated data while retaining most information.

PCA uses an orthogonal transformation to project highly correlated data onto a lower-dimensional linear space that:

  • Maximizes variance of projected data
  • Minimizes the mean squared distance between the data points and their projections

The newly obtained values have low correlation and are called principal components. There are three common approaches to choosing the number of principal components:

  • Eigenvalue criterion
  • Proportion of variance explained criterion
  • Scree plot criterion

We will be using the Scree Plot criterion. A scree plot is a simple line plot of the eigenvalue of each individual PC: eigenvalues on the y-axis, component number on the x-axis. It always slopes downward, starting high on the left, falling quickly, and eventually flattening out; the first component explains the most variance, the next few explain moderate variance, and the remaining components explain little. The scree plot criterion looks for the “elbow” in the curve and selects all components just before the line flattens out.
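
The eigenvalues plotted below are exactly the eigenvalues of the covariance matrix of the data; a quick sketch of the equivalence, using the X defined earlier:

# Sketch: the covariance-matrix eigenvalues equal PCA's explained_variance_.
cov = np.cov(X.to_numpy(), rowvar=False)
eigvals = np.sort(np.linalg.eigvalsh(cov))[::-1]
print(eigvals[:5])  # should match cov_matrix_PCA.explained_variance_[:5] after fitting below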

In [21]:
from sklearn.decomposition import PCA

cov_matrix_PCA = PCA(n_components=len(X.columns))
cov_matrix_PCA.fit(X)
Out[21]:
PCA(n_components=12)
In [22]:
plt.xlabel('Number of Features')
plt.ylabel('Eigenvalues')
plt.title('PCA Eigenvalues')
plt.ylim(0, max(cov_matrix_PCA.explained_variance_))
plt.axhline(y=1, color='r', linestyle='-.')
plt.plot(cov_matrix_PCA.explained_variance_)
plt.show()
[Image: scree plot of PCA eigenvalues]

Kaiser Rule: Select principal components with eigenvalues of at least 1.

This rule provides a simple and straightforward approach to selecting principal components. However, in our case it suggests keeping only 1 principal component, which may not be ideal: the rule assumes standardized (unit-variance) features, and our min-max-scaled columns have quite different variances.
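
Applying the rule directly to the fitted PCA (sketch):

# Kaiser rule (sketch): count components whose eigenvalue is at least 1.
n_kaiser = int(np.sum(cov_matrix_PCA.explained_variance_ >= 1))
print("Components with eigenvalue >= 1:", n_kaiser)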

In [23]:
var = np.cumsum(np.round(cov_matrix_PCA.explained_variance_ratio_, decimals=3)*100)
plt.xlabel('Number of Features')
plt.ylabel('% of variance explained')
plt.title('PCA Variance Explained')
plt.ylim(min(var), 100.5)
plt.axhline(y=80, color='r', linestyle='-.')
plt.axvline(x=3, color='r', linestyle='-.')
plt.plot(var)
plt.show()
[Image: cumulative % of variance explained vs. number of PCA components]

Proportion of Variance Plot: This approach looks at the cumulative proportion of variance explained by the PCs and selects the smallest number that collectively explains 80% of the total variance. This captures a substantial amount of information while still reducing dimensionality effectively. It is more flexible than the Kaiser rule and indicates 3 principal components, so we will move forward with 3 PCs.
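
The cutoff can also be computed instead of read off the plot (sketch):

# Sketch: smallest number of components reaching 80% cumulative explained variance.
cum_var = np.cumsum(cov_matrix_PCA.explained_variance_ratio_)
n_components_80 = int(np.argmax(cum_var >= 0.80)) + 1
print("Components needed for 80% of the variance:", n_components_80)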

In [24]:
# Run PCA with 3 principal components (PCA is unsupervised; labels are not used)
pca = PCA(n_components=3)
X_PCA = pca.fit_transform(X)

# Create a 3D scatter plot
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')

# Defined a colormap for assigning colors based on labels
cmap = plt.get_cmap('tab10')
# Scatter plot with the first three PCA components
scatter = ax.scatter(X_PCA[:, 0], X_PCA[:, 1], X_PCA[:, 2], c=y, cmap=cmap)

# Created a custom colorbar with labels based on unique classes
cbar = plt.colorbar(scatter, ax=ax, ticks=np.unique(y))
cbar.set_ticklabels(np.unique(y_original))
cbar.set_label('Predicted Labels')

# Add labels and title
ax.set_xlabel('PCA Component 1')
ax.set_ylabel('PCA Component 2')
ax.set_zlabel('PCA Component 3')
plt.title('PCA 3D Scatter Plot:')
ax.view_init(elev=20, azim=60)

plt.show()
[Image: 3D scatter plot of the three PCA components]

Now we will run SVD on the dataframe, which provides a reduced-rank approximation of the data. We will perform linear dimensionality reduction by means of truncated singular value decomposition.
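
One difference worth noting: TruncatedSVD does not mean-center the data, while PCA does, which is why the two can suggest different component counts here. A sketch of the relationship:

from sklearn.decomposition import TruncatedSVD

# Sketch: on mean-centered data, TruncatedSVD reproduces PCA's variance ratios.
X_centered = X - X.mean(axis=0)
svd_check = TruncatedSVD(n_components=3).fit(X_centered)
pca_check = PCA(n_components=3).fit(X)
print(svd_check.explained_variance_ratio_)
print(pca_check.explained_variance_ratio_)  # numerically close to the line above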

In [25]:
from sklearn.decomposition import TruncatedSVD

cov_matrix_SVD = TruncatedSVD(n_components=len(X.columns))
cov_matrix_SVD.fit(X)
Out[25]:
TruncatedSVD(n_components=12)
In [26]:
var = np.cumsum(np.round(cov_matrix_SVD.explained_variance_ratio_, decimals=3)*100)
plt.xlabel('Number of Features')
plt.ylabel('% of variance explained')
plt.title('SVD Variance Explained')
plt.ylim(min(var), 100.5)
plt.axhline(y=80, color='r', linestyle='-.')
plt.axvline(x=4, color='r', linestyle='-.')
plt.plot(var)
plt.show()
[Image: cumulative % of variance explained vs. number of SVD components]

Proportion of Variance Plot: This approach looks at the cumulative proportion of variance explained by the components and selects the smallest number that collectively explains 80% of the total variance. Here it indicates 4 components, so we will move forward with 4 components.

In [27]:
# Run Truncated SVD with 4 components (unsupervised; labels are not used)
svd = TruncatedSVD(n_components=4)
X_SVD = svd.fit_transform(X)

# Create a 3D scatter plot
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')

# Defined a colormap for assigning colors based on labels
cmap = plt.get_cmap('tab10')
# Scatter plot with the first three SVD components
scatter = ax.scatter(X_SVD[:, 0], X_SVD[:, 1], X_SVD[:, 2], c=y, cmap=cmap)

# Created a custom colorbar with labels based on unique classes
cbar = plt.colorbar(scatter, ax=ax, ticks=np.unique(y))
cbar.set_ticklabels(np.unique(y_original))
cbar.set_label('Predicted Labels')

# Add labels and title
ax.set_xlabel('SVD Component 1')
ax.set_ylabel('SVD Component 2')
ax.set_zlabel('SVD Component 3')
plt.title('SVD 3D Scatter Plot:')
ax.view_init(elev=20, azim=60)

plt.show()
[Image: 3D scatter plot of the first three SVD components]

This scatterplot represents 3 of the 4 Truncated SVD components.

In [28]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

cov_matrix_LDA = LinearDiscriminantAnalysis(n_components=len(np.unique(y_original)) - 1)
X_LDA = cov_matrix_LDA.fit_transform(X, y.values.ravel())

# Create a 3D scatter plot
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')

# Defined a colormap for assigning colors based on labels
cmap = plt.get_cmap('tab10')
# Scatter plot with the first three LDA components
scatter = ax.scatter(X_LDA[:, 0], X_LDA[:, 1], X_LDA[:, 2], c=y, cmap=cmap)

# Created a custom colorbar with labels based on unique classes
cbar = plt.colorbar(scatter, ax=ax, ticks=np.unique(y))
cbar.set_ticklabels(np.unique(y_original))
cbar.set_label('Predicted Labels')

# Add labels and title
ax.set_xlabel('LDA Component 1')
ax.set_ylabel('LDA Component 2')
ax.set_zlabel('LDA Component 3')
plt.title('LDA 3D Scatter Plot:')
ax.view_init(elev=20, azim=60)

plt.show()
[Image: 3D scatter plot of the first three LDA components]
In [29]:
from sklearn.manifold import TSNE

tsne = TSNE(n_components=3)
X_TSNE = tsne.fit_transform(X)  # t-SNE is unsupervised; labels are not used

# Create a 3D scatter plot
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')

# Defined a colormap for assigning colors based on labels
cmap = plt.get_cmap('tab10')
# Scatter plot with the first three t-SNE components
scatter = ax.scatter(X_TSNE[:, 0], X_TSNE[:, 1], X_TSNE[:, 2], c=y, cmap=cmap)

# Create a custom colorbar with labels based on unique classes
cbar = plt.colorbar(scatter, ax=ax, ticks=np.unique(y))
cbar.set_ticklabels(np.unique(y_original))
cbar.set_label('Predicted Labels')

# Add labels and title
ax.set_xlabel('t-SNE Component 1')
ax.set_ylabel('t-SNE Component 2')
ax.set_zlabel('t-SNE Component 3')
plt.title('t-SNE 3D Scatter Plot:')
ax.view_init(elev=20, azim=60)

plt.show()
[Image: 3D scatter plot of the three t-SNE components]

Now we are ready to run our models. Since we are dealing with a classification problem, we will try these models:

  • Random Forest
  • Naive Bayes
  • Stochastic Gradient Descent Classifier
  • Logistic Regression
In [30]:
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.cluster import KMeans, DBSCAN
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.preprocessing import label_binarize
from sklearn.metrics import precision_score, recall_score, f1_score
import warnings
In [31]:
# We can add parameters as required.
models = [['Naive Bayes', GaussianNB()], ['SGD', OneVsOneClassifier(SGDClassifier())], ['Logistic', LogisticRegression(multi_class = 'ovr')]]

# Random Forest is evaluated separately here on one-vs-rest binarized labels.
RF_clf = RandomForestClassifier(random_state=42, min_samples_split = 5)

res = []
kfold = StratifiedKFold(n_splits=10, random_state=13, shuffle=True)
bin_encoded_ytrain = label_binarize(ytrain, classes=np.unique(ytrain))

RF_cross_val_score = cross_val_score(RF_clf, xtrain, bin_encoded_ytrain, cv=10, scoring='accuracy')
print("Random Forest ->")
print("CrossVal Score Mean:    ", RF_cross_val_score.mean())
print("CrossVal Score Std:    ", RF_cross_val_score.std())

for name, model in models:
    cv_score = cross_val_score(model, xtrain, y_train, cv = kfold, scoring = 'accuracy')
    res.append(cv_score)
    print("")
    print(name,"->")
    print("CrossVal Score Mean:    ", cv_score.mean())
    print("CrossVal Score Std:    ", cv_score.std())
Random Forest ->
CrossVal Score Mean:     0.44494324045407635
CrossVal Score Std:     0.01684579452116015

Naive Bayes ->
CrossVal Score Mean:     0.44028637770897827
CrossVal Score Std:     0.06299859936903071

SGD ->
CrossVal Score Mean:     0.5655228758169935
CrossVal Score Std:     0.045086671245315006

Logistic ->
CrossVal Score Mean:     0.5995700034399725
CrossVal Score Std:     0.02942973985870683

All of our models have fairly low accuracy: we have limited data, and the current features do not represent the genre classes well. Logistic Regression has the highest cross-validation accuracy (~60%). We will now calculate precision and recall, and from those the F1 score.
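
One caveat before reading the next cell's numbers: for single-label multiclass predictions, micro-averaged precision, recall, and F1 all collapse to plain accuracy (every false positive for one class is simultaneously a false negative for another), which is why three of the models below print three identical values. A tiny sketch:

from sklearn.metrics import accuracy_score, precision_score, recall_score

# Sketch: micro-averaged precision == micro-averaged recall == accuracy.
y_true = [0, 1, 2, 2, 1]
y_hat  = [0, 2, 2, 1, 1]
print(precision_score(y_true, y_hat, average='micro'),
      recall_score(y_true, y_hat, average='micro'),
      accuracy_score(y_true, y_hat))  # 0.6 0.6 0.6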

In [32]:
res_precision_recall = []

y_pred_RF = cross_val_predict(RF_clf, xtrain, bin_encoded_ytrain, cv = 10)
res_precision_recall += [['Random Forest', precision_score(bin_encoded_ytrain, y_pred_RF, average = "micro"), \
                             recall_score(bin_encoded_ytrain, y_pred_RF, average = "micro")]]

RF_precision = precision_score(bin_encoded_ytrain, y_pred_RF, average = "micro")
RF_recall = recall_score(bin_encoded_ytrain, y_pred_RF, average = "micro")
RF_F1 = 2 * (RF_precision * RF_recall) / (RF_precision + RF_recall)
print('%s -> %s: %f     %s: %f    %s: %f' % ('Random Forest', 'Precision Score', RF_precision, 'Recall Score', RF_recall, 'F1 Score', RF_F1))

for name, model in models:
    y_pred = cross_val_predict(model, xtrain, y_train, cv = kfold)
    precision = precision_score(y_train, y_pred, average = "micro")
    recall = recall_score(y_train, y_pred, average = "micro")
    # storing the precision and recall values
    res_precision_recall += [[name , precision, recall]]
    model_f1_score = 2 * (precision * recall) / (precision + recall)
    print("")
    print('%s -> %s: %f     %s: %f    %s: %f' % (name, 'Precision Score', precision, 'Recall Score', recall, 'F1 Score', model_f1_score))
Random Forest -> Precision Score: 0.723110     Recall Score: 0.444954    F1 Score: 0.550913

Naive Bayes -> Precision Score: 0.440367     Recall Score: 0.440367    F1 Score: 0.440367

SGD -> Precision Score: 0.549148     Recall Score: 0.549148    F1 Score: 0.549148

Logistic -> Precision Score: 0.599607     Recall Score: 0.599607    F1 Score: 0.599607

Logistic Regression has the best F1 score (~0.60), followed by the Random Forest Classifier, then SGD and Naive Bayes. Now we will run these models on the test data.

In [33]:
precision_scores = []
recall_scores = []
F1_scores = []

RF_clf.fit(xtrain, ytrain)
RF_y_pred = RF_clf.predict(xtest)
precision_scores.append(precision_score(ytest, RF_y_pred, average='weighted', zero_division=1))
recall_scores.append(recall_score(ytest, RF_y_pred, average='weighted'))
F1_scores.append(f1_score(ytest, RF_y_pred, average='weighted'))
print("Random Forest Precision Score: ", precision_scores[0])
print("Random Forest Recall Score: ", recall_scores[0])
print('Random Forest F1 Score: ', F1_scores[0])

for name, model in models:
  model_pred = (model.fit(xtrain, ytrain)).predict(xtest)
  precision_scores.append(precision_score(ytest, model_pred, average='weighted', zero_division=1))
  recall_scores.append(recall_score(ytest, model_pred, average='weighted'))
  F1_scores.append(f1_score(ytest, model_pred, average='weighted'))
  print("")
  print(name, "Precision Score: ", precision_scores[-1])
  print(name, "Recall Score: ", recall_scores[-1])
  print(name, "F1 Score: ", F1_scores[-1])

width = 0.2
x = np.arange(4)

plt.bar(x - width, F1_scores, width, label='F1 Score')
plt.bar(x, precision_scores, width, label='Precision')
plt.bar(x + width, recall_scores, width, label='Recall')

plt.xlabel("Models")
plt.ylabel("Scores")
plt.title("Comparison of Model Performance")
plt.xticks(x, ['Random Forest', 'Naive Bayes', 'SGD', 'Logistic'])
plt.legend()
plt.show()
Random Forest Precision Score:  0.6629803828380919
Random Forest Recall Score:  0.6204188481675392
Random Forest F1 Score:  0.571231343329644

Naive Bayes Precision Score:  0.5269295384862275
Naive Bayes Recall Score:  0.40575916230366493
Naive Bayes F1 Score:  0.42457614276887556

SGD Precision Score:  0.588996202472187
SGD Recall Score:  0.5785340314136126
SGD F1 Score:  0.4955796059685683

Logistic Precision Score:  0.6073080972690835
Logistic Recall Score:  0.5759162303664922
Logistic F1 Score:  0.5034886291294335
[Image: grouped bar chart comparing F1, precision, and recall for each model]
In [34]:
# kmeans = KMeans(n_clusters=len(np.unique(y)), random_state=42, init='k-means++', n_init="auto").fit(xtrain)
# y_pred = kmeans.predict(xtest)

wcss = []
# Suppress the FutureWarning
with warnings.catch_warnings():
  warnings.filterwarnings("ignore", category=FutureWarning)

  for i in range(1, 25):
    kmeans = KMeans(i)
    kmeans.fit(xtrain)
    wcss.append(kmeans.inertia_)

# print("Accuracy:", accuracy_score(ytest, y_pred)*100, "%")
# cm = confusion_matrix(ytest, y_pred)
# sns.heatmap(cm, annot=True)
In [35]:
num_clusters = range(1,25)
plt.plot(num_clusters, wcss)
plt.title("The Elbow Method")
plt.xlabel("Number of Clusters")
plt.ylabel("Within-cluster Sum of Squares")
plt.axvline(x=7, color='r', linestyle='-.')
plt.show()
print("This suggests that we can move forward with 7 clusters for KMeans.")
[Image: elbow plot of within-cluster sum of squares vs. number of clusters]
This suggests that we can move forward with 7 clusters for KMeans.
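
As a second opinion on the cluster count, silhouette scores can be computed for a range of k (a sketch; this metric is not used elsewhere in the notebook):

from sklearn.metrics import silhouette_score

# Sketch: average silhouette score for several cluster counts.
with warnings.catch_warnings():
  warnings.filterwarnings("ignore", category=FutureWarning)
  for k in range(2, 10):
    labels = KMeans(n_clusters=k, random_state=42).fit(xtrain).labels_
    print(k, round(silhouette_score(xtrain, labels), 3))
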
In [36]:
# kfold = KFold(n_splits=5, shuffle=True, random_state=42)
# kfold.get_n_splits(X)

# ADA_BC_clf = AdaBoostClassifier(n_estimators=250, random_state=0)
# scores = []
# for i, (train_index, test_index) in enumerate(kfold.split(X)):
#   X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
#   y_train, y_test = y.iloc[train_index], y[test_index]
#   # Training and evaluating on each fold
#   ADA_BC_clf.fit(xtrain, ytrain)
#   y_pred = ADA_BC_clf.predict(xtest)
#   scores.append(ADA_BC_clf.score(xtest, ytest))
# print(scores)
In [37]:
with warnings.catch_warnings():
  warnings.filterwarnings("ignore", category=FutureWarning)
  kmeans = KMeans(n_clusters=7, init='k-means++', max_iter=100, random_state=42).fit(xtrain)
print(kmeans.labels_, ytrain)

y_pred = kmeans.predict(xtest)

print("Accuracy:", accuracy_score(ytest, y_pred)*100, "%")
cm = confusion_matrix(ytest, y_pred)
sns.heatmap(cm, annot=True)
[0 5 1 ... 6 1 5] [2 5 5 ... 5 5 6]
Accuracy: 12.041884816753926 %
Out[37]:
<Axes: >
[Image: heatmap of the confusion matrix between test labels and KMeans cluster predictions]
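
The accuracy above is misleading: KMeans cluster IDs are arbitrary and need not line up with the encoded genre labels. Mapping each cluster to its majority training genre first gives a fairer (though still rough) score; a sketch:

# Sketch: relabel each cluster with the most common training genre inside it.
cluster_to_genre = {c: np.bincount(ytrain[kmeans.labels_ == c]).argmax()
                    for c in np.unique(kmeans.labels_)}
mapped_pred = np.array([cluster_to_genre[c] for c in kmeans.predict(xtest)])
print("Mapped accuracy:", accuracy_score(ytest, mapped_pred) * 100, "%")
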
In [38]:
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error

# Create a Lasso model with a specific alpha (regularization strength)
lasso = Lasso(alpha=0.1)

# Fit the Lasso model to the training data
lasso.fit(xtrain, ytrain)
# Make predictions on the test data
y_pred = lasso.predict(xtest)

# Evaluate the model's performance (e.g., using Mean Squared Error)
mse = mean_squared_error(ytest, y_pred)
print(f"Mean Squared Error: {mse}")
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred)
plt.xlabel("Actual Values")
plt.ylabel("Predicted Values")
plt.title("Actual vs. Predicted Values")
plt.show()
Mean Squared Error: 4.110939282210901
[Image: scatter plot of actual vs. predicted values for the Lasso model]
In [39]:
# Get feature coefficients from the Lasso model
feature_importance = lasso.coef_

# Create a bar plot to show feature importance
sns.barplot(x=feature_importance, y=X.columns)
plt.xlabel("Coefficient Value")
plt.ylabel("Feature")
plt.title("Feature Importance")
plt.show()
[Image: bar plot of Lasso coefficient values per feature]
In [40]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import plotly.express as px

dataf = df_normalized
with warnings.catch_warnings():
  warnings.filterwarnings("ignore", category=FutureWarning)
  cluster_pipeline = Pipeline([('scaler', StandardScaler()), ('kmeans', KMeans(n_clusters=7))])
  A = dataf.select_dtypes(np.number)
  cluster_pipeline.fit(A)
  dataf['cluster'] = cluster_pipeline.predict(A)

tsne_pipeline = Pipeline([('scaler', StandardScaler()), ('tsne', TSNE(n_components=2, verbose=1))])
genre_embedding = tsne_pipeline.fit_transform(A)
projection = pd.DataFrame(columns=['x', 'y'], data=genre_embedding)

projection['genre'] = dataf['genre']
projection['cluster'] = dataf['cluster']

fig = px.scatter(projection, x='x', y='y', color='cluster', hover_data=['x', 'y', 'genre'])
fig.show()
[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 1908 samples in 0.002s...
[t-SNE] Computed neighbors for 1908 samples in 0.122s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1908
[t-SNE] Computed conditional probabilities for sample 1908 / 1908
[t-SNE] Mean sigma: 1.015253
[t-SNE] KL divergence after 250 iterations with early exaggeration: 72.023224
[t-SNE] KL divergence after 1000 iterations: 1.370458