import pandas as pd
# Load the CSV file
memespector_file = "/content/drive/MyDrive/2024-01-09-Bauernproteste/2024-01-11-Google-Vision-All.csv"
df = pd.read_csv(memespector_file)

# Keep only the image identifier and its ';'-separated label string
df = df[['Image_BaseName', 'GV_Label_Descriptions']]

# Splitting the 'GV_Label_Descriptions' into individual labels.
# explode() yields one row per label and repeats the originating row's index,
# so the result already lines up with df's index (no stack/droplevel needed,
# and no reliance on NaN padding being dropped).
split_labels = df['GV_Label_Descriptions'].str.split(';').explode()
split_labels.name = 'Label'

# Joining the split labels with the original dataframe
# (index-based left join: one output row per image/label pair)
df_split = df.join(split_labels)

# Creating a matrix of True/False values for each label per Image_BaseName.
# The aggfunc ignores the grouped values and just marks the (image, label)
# pair as present; absent pairs are filled with False.
matrix = pd.pivot_table(
    df_split,
    index='Image_BaseName',
    columns='Label',
    aggfunc=lambda x: True,
    fill_value=False,
)

# Resetting the column headers to be the label names only.
# pivot_table aggregated the remaining value column, so each column key is a
# (value column, label) tuple; keep just the label part.
matrix.columns = [col[1] for col in matrix.columns.values]
# Now 'matrix' has a single level of column headers with only the label names
In [1]:
In [2]:
matrix
Adaptation | Advertising | Afterglow | Agricultural machinery | Agriculture | Air travel | Aircraft | Airliner | Airplane | Alloy wheel | ... | Vertebrate | Water | Water resources | Wheel | Whiskers | White | Window | Wood | Working animal | World | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Image_BaseName | |||||||||||||||||||||
6750551853789891846.jpg | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
6750761577349254405.jpg | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
6751467034741067014.jpg | False | False | False | False | True | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
6763591353164254469.jpg | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
6766552734108749062.jpg | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
7321800737606896928.jpg | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
7321804342179204384.jpg | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
7321804909290999045.jpg | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | True | False | False | False | False | False | False |
7321806774967815457.jpg | True | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
7321806890906701089.jpg | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
982 rows × 681 columns
In [3]:
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import numpy as np
# Ensuring that 'Image_BaseName' is not part of the matrix to apply PCA:
# it is the index of 'matrix', so taking .values leaves it out.
image_base_names = matrix.index  # Saving the image base names for later use
label_matrix = matrix.values  # Convert to numpy array for PCA

# Dimensionality reduction using PCA
# A float n_components in (0, 1) asks PCA to keep as many components as are
# needed to explain that fraction of the variance (here 95%).
pca = PCA(n_components=0.95)
matrix_reduced = pca.fit_transform(label_matrix)

# If needed, you can create a DataFrame from the PCA-reduced matrix and
# reattach the 'Image_BaseName' values as its index
matrix_reduced_df = pd.DataFrame(matrix_reduced, index=image_base_names)
In [4]:
matrix_reduced_df
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Image_BaseName | |||||||||||||||||||||
6750551853789891846.jpg | 1.392793 | -0.851573 | -0.225060 | -0.630954 | 0.345822 | -0.313126 | 0.376667 | 0.370456 | -0.012519 | -0.898472 | ... | -0.007803 | 0.022912 | -0.002782 | 0.019272 | -0.005465 | -0.005129 | 0.011833 | 0.000200 | 0.006499 | 0.010995 |
6750761577349254405.jpg | -1.045212 | 0.139963 | -0.396712 | 0.505531 | -0.186165 | 0.278001 | 0.860551 | -0.387782 | -0.041959 | 0.146992 | ... | 0.020865 | 0.027422 | 0.064993 | 0.046791 | 0.042511 | -0.040843 | -0.091713 | -0.064683 | 0.043392 | -0.045372 |
6751467034741067014.jpg | 0.364738 | 0.089808 | 0.603463 | 0.717136 | 0.084382 | 0.130516 | 0.835040 | 0.056190 | -0.175465 | -0.551632 | ... | -0.009497 | 0.144801 | -0.020713 | 0.035502 | -0.085562 | -0.169911 | 0.083582 | 0.045916 | -0.123521 | 0.032273 |
6763591353164254469.jpg | 0.657532 | -0.007257 | -0.226448 | -0.142833 | -0.615043 | -0.208217 | -0.082478 | 0.181550 | 0.899774 | 0.462160 | ... | -0.025889 | 0.006257 | 0.060421 | 0.028564 | 0.045773 | 0.000179 | 0.003499 | 0.027838 | 0.007171 | -0.051516 |
6766552734108749062.jpg | 1.638604 | -0.418596 | -0.178993 | -0.522654 | 0.663303 | -0.186928 | 1.000894 | -0.307874 | -0.172688 | 0.336597 | ... | -0.009052 | -0.002043 | 0.007575 | -0.031553 | 0.007831 | -0.005779 | -0.023599 | -0.021165 | -0.000496 | -0.006467 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
7321800737606896928.jpg | -0.698156 | 0.191274 | -0.529836 | 0.047008 | 0.862388 | -0.111187 | -0.390502 | -0.089231 | 0.144091 | 0.326504 | ... | -0.015025 | -0.068188 | -0.023787 | 0.009343 | 0.004624 | 0.001396 | 0.097441 | 0.145987 | -0.102992 | 0.110626 |
7321804342179204384.jpg | 0.032051 | 0.048450 | 0.454149 | -0.012114 | 0.395014 | 0.128612 | 0.042362 | 1.019634 | -0.367217 | 1.025644 | ... | -0.002146 | -0.042328 | 0.114229 | -0.066740 | -0.051395 | -0.021397 | 0.012134 | 0.046365 | -0.005712 | 0.036329 |
7321804909290999045.jpg | 1.005015 | 0.923683 | 0.371054 | 0.533427 | 0.356759 | 0.813597 | 0.087288 | -0.289707 | 0.377865 | 1.242866 | ... | 0.005721 | 0.000672 | 0.021087 | 0.020260 | 0.037709 | 0.000290 | 0.015725 | 0.013237 | 0.018040 | -0.002060 |
7321806774967815457.jpg | -0.597974 | 0.855850 | -0.262498 | -0.214283 | -0.731812 | -0.209626 | -0.179683 | 0.529353 | -0.239506 | 0.048401 | ... | -0.012399 | 0.023383 | -0.073488 | 0.063523 | 0.013320 | 0.020351 | -0.033865 | 0.029809 | -0.080413 | -0.074329 |
7321806890906701089.jpg | -0.042383 | -0.138050 | 0.075564 | -0.396196 | 0.056236 | 0.612394 | -0.272538 | -0.230238 | -0.379339 | -0.668773 | ... | -0.106623 | -0.214393 | 0.209117 | 0.021869 | 0.220278 | 0.070092 | -0.198979 | 0.140981 | -0.004653 | -0.070667 |
982 rows × 242 columns
In [5]:
# Elbow method to determine optimal number of clusters
inertia = []
range_values = range(1, 20)  # Checking for 1 to 19 clusters

for i in range_values:
    # Fixed random_state so repeated runs give the same inertia curve
    kmeans = KMeans(n_clusters=i, n_init=10, random_state=0)
    kmeans.fit(matrix_reduced_df)
    inertia.append(kmeans.inertia_)

# Plotting the Elbow Curve
plt.figure(figsize=(10, 6))
plt.plot(range_values, inertia, marker='o')
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.show()
In [11]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
# Define the range of clusters to try
# (starts at 2 because the silhouette score is undefined for a single cluster)
range_values = range(2, 20)
silhouette_scores = []

# Perform k-means clustering and compute silhouette scores
for i in range_values:
    try:
        kmeans = KMeans(n_clusters=i, n_init=10, random_state=0)
        kmeans.fit(matrix_reduced_df)
        score = silhouette_score(matrix_reduced_df, kmeans.labels_)
    except Exception as e:
        # Deliberate best-effort: report the failing k and keep going.
        print(f"An error occurred with {i} clusters: {e}")
        # Record a gap so the scores stay aligned with range_values;
        # otherwise the plot below fails on mismatched x/y lengths.
        score = float('nan')
    silhouette_scores.append(score)

# Plotting the Silhouette Scores
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*';
# use whichever name this installation provides.
style_name = next(
    (
        candidate
        for candidate in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
        if candidate in plt.style.available
    ),
    'default',
)
with plt.style.context(style_name):
    plt.figure(figsize=(10, 6))
    plt.plot(range_values, silhouette_scores, marker='o')
    plt.title('Silhouette Method')
    plt.xlabel('Number of clusters')
    plt.ylabel('Silhouette Score')
    plt.show()
In [8]:
# Final k-means clustering using 11 clusters
# NOTE(review): 11 is presumably read off the elbow/silhouette plots - confirm.
kmeans_final = KMeans(n_clusters=11, n_init=10, random_state=0)
clusters = kmeans_final.fit_predict(matrix_reduced)

# Adding the cluster information back to the original dataframe.
# 'matrix_reduced' keeps the row order of 'matrix', so the labels can be
# assigned positionally. This adds a non-label column to 'matrix'.
matrix['Cluster'] = clusters
In [12]:
# Displaying the first few rows of the dataframe with cluster information
matrix.head()
Adaptation | Advertising | Afterglow | Agricultural machinery | Agriculture | Air travel | Aircraft | Airliner | Airplane | Alloy wheel | ... | Water | Water resources | Wheel | Whiskers | White | Window | Wood | Working animal | World | Cluster | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Image_BaseName | |||||||||||||||||||||
6750551853789891846.jpg | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | 8 |
6750761577349254405.jpg | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | 2 |
6751467034741067014.jpg | False | False | False | False | True | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | 6 |
6763591353164254469.jpg | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | 0 |
6766552734108749062.jpg | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | 8 |
5 rows × 682 columns
In [2]:
!unzip /content/drive/MyDrive/2024-01-09-Bauernproteste/2024-01-09-Images-Clean.zip
In [3]:
# Display the result. See linked notebook for code.