"""cPro evaluation script: a circular projection fitted by gradient descent is compared
against 1D and 2D MDS on a collection of synthetic and real datasets. For every dataset
the script renders the projections, logs the optimisation loss, and writes stress,
distance-correlation, silhouette, trustworthiness and average-distance metrics to
evaluation_output/."""

import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import skdim
import torch
from scipy.spatial import distance_matrix
from scipy.stats import pearsonr
from sklearn import datasets
from sklearn.datasets import make_blobs
from sklearn.manifold import MDS
from sklearn.metrics import silhouette_score, euclidean_distances
from sklearn.metrics.pairwise import cosine_distances, manhattan_distances
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler


def d_cos_distance_matrix(D):
    # High-dimensional reference distances: cosine distances rescaled to [0, 1].
    return cosine_distances(D) / 2


def d_circ_distance_matrix_torch(P):
    # Circular (wrap-around) distances between 1-D positions, rescaled to [0, 1].
    m = torch.cdist(P, P)
    return 2 * torch.fmin(m, 1 - m)


def loss_torch(M, P):
    # Mean absolute difference between the high-dimensional and circular distance matrices.
    return torch.sum(torch.abs(M - d_circ_distance_matrix_torch(P))) / P.shape[0] ** 2


def gradient_descent(D, y, n_iter=100, learning_rate=0.1):
    # Fit the circular projection (cPro) with Adam; y is passed in explicitly and only
    # attached to the result for plotting and evaluation.
    start = time.time()
    H = []
    M = torch.from_numpy(d_cos_distance_matrix(D))
    P = torch.reshape(torch.rand(D.shape[0]), (D.shape[0], 1))
    P.requires_grad_()
    optimizer = torch.optim.Adam([P], lr=learning_rate)
    for i in range(n_iter):
        optimizer.zero_grad()
        loss = loss_torch(M, P)
        H.append(loss.item())
        loss.backward(retain_graph=True)
        optimizer.step()
        # Plain gradient descent, for use without the optimizer
        # (requires a manual parameter update):
        # with torch.no_grad():
        #     P.sub_(P.grad * learning_rate)
        #     P.grad.zero_()
    d = pd.DataFrame(data=P.detach().numpy(), columns=['ori'])
    d['x'] = d['ori'].apply(lambda x: np.cos(x * 2 * np.pi))
    d['y'] = d['ori'].apply(lambda x: np.sin(x * 2 * np.pi))
    d['target'] = y
    return {
        "data_prepared": d,
        "points": P.detach().clone(),
        "loss": loss.detach().clone(),
        "loss_history": H,
        "time": time.time() - start,
    }
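# Illustrative usage sketch (X_demo / y_demo are placeholder names, not defined anywhere
# else in this script):
#   X_demo = np.random.rand(50, 4)                 # 50 points in 4 dimensions
#   y_demo = np.random.randint(0, 3, 50)           # arbitrary labels for colouring
#   res = gradient_descent(X_demo, y_demo, n_iter=50, learning_rate=0.05)
#   res["data_prepared"][["x", "y"]]               # positions on the unit circle
#   res["loss_history"]                            # one loss value per iteration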
def apply_custom_projection(X, y, n_iter=100, learning_rate=0.01):
    # Variant of the circular projection that works from a Euclidean (not cosine)
    # high-dimensional distance matrix and returns (DataFrame, loss_history).
    # It is not called in the evaluation loop below.
    def torch_loss(x, t_hd_dist_mat):
        m = torch.cdist(x, x)
        return torch.mean(torch.abs(t_hd_dist_mat - 2 * torch.min(m, 1 - m)))

    N, _ = X.shape
    # NOTE: Euclidean distances here, unlike gradient_descent, which uses cosine_distances / 2.
    hd_dist_mat = distance_matrix(X, X) / 2
    t_hd_dist_mat = torch.from_numpy(hd_dist_mat)
    init = torch.rand(N, 1, requires_grad=True)
    optimizer = torch.optim.Adam([init], lr=learning_rate)
    loss_history = []
    start = time.time()
    for i in range(n_iter):
        optimizer.zero_grad()
        loss = torch_loss(init, t_hd_dist_mat)
        loss.backward()
        optimizer.step()
        loss_history.append(loss.item())
    print(f"Final Loss: {loss.item()}")
    print(f"Optimization Time: {time.time() - start:.2f} seconds")
    d = pd.DataFrame(data=init.detach().numpy(), columns=['ori'])
    d['x'] = d['ori'].apply(lambda x: np.cos(x * 2 * np.pi))
    d['y'] = d['ori'].apply(lambda x: np.sin(x * 2 * np.pi))
    d['target'] = y
    return d, loss_history


def apply_2d_mds(X):
    scaler = StandardScaler()
    X_std = scaler.fit_transform(X)  # standardized features
    mds = MDS(n_components=2, random_state=777)
    return mds.fit_transform(X_std)


def apply_1d_mds(X):
    scaler = StandardScaler()
    X_std = scaler.fit_transform(X)  # standardized features
    mds = MDS(n_components=1, random_state=777)
    return mds.fit_transform(X_std)


def visualize_cPro(d, y):
    # d is the "data_prepared" DataFrame from gradient_descent; y is unused because the
    # DataFrame already carries a 'target' column.
    d['target'] = d['target'].astype('category')
    plt.figure(figsize=(8, 8))
    sns.scatterplot(data=d, x='x', y='y', hue='target', palette='husl',
                    edgecolor='white', linewidth=0.2, legend=False)
    plt.gca().set_aspect('equal', 'box')
    plt.gca().set_facecolor('white')
    plt.xlabel("x")
    plt.ylabel("y")
    plt.xlim(-1.1, 1.1)
    plt.ylim(-1.1, 1.1)
    plt.show()


def visualize_2d_mds(X_2d, y):
    df_2d = pd.DataFrame(X_2d, columns=['Component 1', 'Component 2'])
    df_2d['Target'] = y
    # Use a categorical palette that supports a large number of categories.
    n_categories = df_2d['Target'].nunique()
    palette = sns.color_palette("husl", n_categories)
    plt.figure(figsize=(8, 8))
    sns.scatterplot(data=df_2d, x='Component 1', y='Component 2', hue='Target',
                    palette=palette, linewidth=0.2, edgecolor='white', legend=False)
    plt.title('2D MDS Projection')
    plt.xlabel('Component 1')
    plt.ylabel('Component 2')
    # plt.legend().set_title('Target')
    plt.show()


def visualize_1d_mds(X_1d, y):
    df_1d = pd.DataFrame(X_1d, columns=['Component 1'])
    df_1d['y'] = 0  # dummy second component for visualization
    df_1d['Target'] = y
    n_categories = df_1d['Target'].nunique()
    palette = sns.color_palette("husl", n_categories)
    plt.figure(figsize=(8, 2))
    sns.scatterplot(data=df_1d, x='Component 1', y='y', hue='Target',
                    palette=palette, linewidth=0.2, edgecolor='white', legend=False)
    plt.title('1D MDS Projection')
    plt.xlabel('Component 1')
    plt.yticks([])
    # plt.legend(title='Target', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.show()
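# The circular layout stores a single number per point ('ori') and maps it onto the unit
# circle via x = cos(2*pi*ori), y = sin(2*pi*ori). Worked values for illustration:
#   ori = 0.00 -> ( 1,  0)
#   ori = 0.25 -> ( 0,  1)
#   ori = 0.50 -> (-1,  0)
#   ori = 0.75 -> ( 0, -1)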
def evaluate(data_std, X_custom, X_2d, X_1d, y):
    # Compare the three projections (2D MDS, 1D MDS, cPro) against the high-dimensional
    # reference data_std; y holds the class labels used for the silhouette scores.
    custom_projection = X_custom[['x', 'y']]
    # The pairwise distance matrices used for stress/correlation are computed further
    # below, after the metric helpers.

    # NOTE: this first calculate_trustworthiness, and calculate_continuity, replace
    # X_high / X_low with distance matrices before fitting NearestNeighbors with the same
    # metric. calculate_continuity is not used by the metrics below, and
    # calculate_trustworthiness is redefined (and superseded) further down.
    def calculate_trustworthiness(X_high, X_low, n_neighbors=5, distance='euclidean'):
        if distance == 'cosine':
            X_high = cosine_distances(X_high)
            X_low = cosine_distances(X_low)
        elif distance == 'manhattan':
            X_high = manhattan_distances(X_high)
            X_low = manhattan_distances(X_low)
        N = X_high.shape[0]
        nn_orig = NearestNeighbors(n_neighbors=n_neighbors + 1, metric=distance).fit(X_high)
        _, indices_orig = nn_orig.kneighbors(X_high)
        nn_proj = NearestNeighbors(n_neighbors=N, metric=distance).fit(X_low)
        _, indices_proj = nn_proj.kneighbors(X_low)
        rank_matrix = np.full((N, n_neighbors), N)
        for i in range(N):
            for j in range(1, n_neighbors + 1):
                if indices_orig[i, j] in indices_proj[i, 1:]:
                    rank_matrix[i, j - 1] = np.where(indices_proj[i] == indices_orig[i, j])[0][0]
        rank_matrix -= (n_neighbors + 1)
        trustworthiness = 1 - (2.0 / (N * n_neighbors * (2 * N - 3 * n_neighbors - 1)) *
                               np.sum(rank_matrix[rank_matrix > n_neighbors] - n_neighbors))
        return trustworthiness

    def calculate_continuity(X_high, X_low, n_neighbors=5, distance='euclidean'):
        if distance == 'euclidean':
            X_high = euclidean_distances(X_high)
            X_low = euclidean_distances(X_low)
        elif distance == 'cosine':
            X_high = cosine_distances(X_high)
            X_low = cosine_distances(X_low)
        elif distance == 'manhattan':
            X_high = manhattan_distances(X_high)
            X_low = manhattan_distances(X_low)
        N = X_high.shape[0]
        nn_low = NearestNeighbors(n_neighbors=n_neighbors + 1, metric=distance).fit(X_low)
        _, indices_low = nn_low.kneighbors(X_low)
        nn_high = NearestNeighbors(n_neighbors=N, metric=distance).fit(X_high)
        _, indices_high = nn_high.kneighbors(X_high)
        rank_matrix = np.full((N, n_neighbors), N)
        for i in range(N):
            for j in range(1, n_neighbors + 1):
                if indices_low[i, j] in indices_high[i, 1:]:
                    rank_matrix[i, j - 1] = np.where(indices_high[i] == indices_low[i, j])[0][0]
        rank_matrix -= (n_neighbors + 1)
        continuity = 1 - (2.0 / (N * n_neighbors * (2 * N - 3 * n_neighbors - 1)) *
                          np.sum(rank_matrix[rank_matrix > n_neighbors] - n_neighbors))
        return continuity

    def calculate_stress(hd_distances, ld_distances):
        # Normalised stress: 0 means the low-dimensional distances match the
        # high-dimensional ones exactly.
        return np.sqrt(np.sum((hd_distances - ld_distances) ** 2) / np.sum(hd_distances ** 2))

    def calculate_correlation(hd_distances, ld_distances):
        # Pearson correlation between the flattened distance matrices.
        return pearsonr(hd_distances.flatten(), ld_distances.flatten())[0]

    # This second definition shadows the one above and is the version actually used below.
    def calculate_trustworthiness(X_high, X_low, n_neighbors=5, distance='euclidean'):
        N = X_high.shape[0]
        # Depending on the distance metric, calculate the distances
        # (computed here but not used further below).
        if distance == 'cosine':
            high_dist = cosine_distances(X_high)
            low_dist = cosine_distances(X_low)
        elif distance == 'manhattan':
            high_dist = manhattan_distances(X_high)
            low_dist = manhattan_distances(X_low)
        else:  # Euclidean
            high_dist = euclidean_distances(X_high)
            low_dist = euclidean_distances(X_low)
        # Nearest neighbours in the original high-dimensional space
        nn_orig = NearestNeighbors(n_neighbors=n_neighbors + 1, metric=distance).fit(X_high)
        _, indices_orig = nn_orig.kneighbors(X_high)
        # Full ranking of all points in the low-dimensional space
        nn_proj = NearestNeighbors(n_neighbors=N, metric=distance).fit(X_low)
        _, indices_proj = nn_proj.kneighbors(X_low)
        # Rank of each original-space neighbour within the projected ordering
        rank_matrix = np.full((N, n_neighbors), N)
        for i in range(N):
            for j in range(1, n_neighbors + 1):
                high_neighbor = indices_orig[i, j]
                if high_neighbor in indices_proj[i]:
                    low_neighbor_rank = np.where(indices_proj[i] == high_neighbor)[0][0]
                    rank_matrix[i, j - 1] = low_neighbor_rank
        # Subtract (n_neighbors + 1) so only neighbours pushed outside the
        # k-neighbourhood contribute to the sum
        rank_matrix -= (n_neighbors + 1)
        trustworthiness = 1 - (2.0 / (N * n_neighbors * (2 * N - 3 * n_neighbors - 1)) *
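    # Reference formulation (Venna & Kaski): with U_k(i) the points among the k nearest
    # neighbours of i in the projection that are not among its k nearest neighbours in the
    # original space, and r(i, j) the rank of j by distance from i in the original space,
    #   T(k) = 1 - 2 / (N * k * (2N - 3k - 1)) * sum_i sum_{j in U_k(i)} (r(i, j) - k).
    # The helpers above use the same normalisation but rank original-space neighbours
    # within the projected ordering (and calculate_continuity does the reverse), i.e. the
    # mirror image of the convention used by sklearn.manifold.trustworthiness; worth
    # keeping in mind if these numbers are compared against sklearn's implementation.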
                               np.sum(rank_matrix[rank_matrix > 0]))
        return trustworthiness

    # Average pairwise distance of a projection, used as a rough proxy for compactness.
    def calculate_average_distance(X, distance='euclidean'):
        if distance == 'cosine':
            dist = cosine_distances(X)
        elif distance == 'manhattan':
            dist = manhattan_distances(X)
        else:  # Euclidean
            dist = euclidean_distances(X)
        return np.mean(dist)

    # Average distance for each projection and distance metric
    avg_dist_euclidean_2d = calculate_average_distance(X_2d, distance='euclidean')
    avg_dist_euclidean_1d = calculate_average_distance(X_1d, distance='euclidean')
    avg_dist_euclidean_custom = calculate_average_distance(custom_projection, distance='euclidean')
    avg_dist_cosine_2d = calculate_average_distance(X_2d, distance='cosine')
    avg_dist_cosine_1d = calculate_average_distance(X_1d, distance='cosine')
    avg_dist_cosine_custom = calculate_average_distance(custom_projection, distance='cosine')
    avg_dist_manhattan_2d = calculate_average_distance(X_2d, distance='manhattan')
    avg_dist_manhattan_1d = calculate_average_distance(X_1d, distance='manhattan')
    avg_dist_manhattan_custom = calculate_average_distance(custom_projection, distance='manhattan')

    # Euclidean distances
    hd_euclidean_dist = euclidean_distances(data_std)
    ld_euclidean_dist_2d = euclidean_distances(X_2d)
    ld_euclidean_dist_1d = euclidean_distances(X_1d)
    ld_euclidean_dist_custom = euclidean_distances(custom_projection)
    # Cosine distances
    hd_cosine_dist = cosine_distances(data_std)
    ld_cosine_dist_2d = cosine_distances(X_2d)
    ld_cosine_dist_1d = cosine_distances(X_1d)
    ld_cosine_dist_custom = cosine_distances(custom_projection)
    # Manhattan distances
    hd_manhattan_dist = manhattan_distances(data_std)
    ld_manhattan_dist_2d = manhattan_distances(X_2d)
    ld_manhattan_dist_1d = manhattan_distances(X_1d)
    ld_manhattan_dist_custom = manhattan_distances(custom_projection)

    # Stress and correlation calculations for each metric
    # Euclidean
    stress_euclidean_2d = calculate_stress(hd_euclidean_dist, ld_euclidean_dist_2d)
    stress_euclidean_1d = calculate_stress(hd_euclidean_dist, ld_euclidean_dist_1d)
    stress_euclidean_custom = calculate_stress(hd_euclidean_dist, ld_euclidean_dist_custom)
    correlation_euclidean_2d = calculate_correlation(hd_euclidean_dist, ld_euclidean_dist_2d)
    correlation_euclidean_1d = calculate_correlation(hd_euclidean_dist, ld_euclidean_dist_1d)
    correlation_euclidean_custom = calculate_correlation(hd_euclidean_dist, ld_euclidean_dist_custom)
    # Cosine
    stress_cosine_2d = calculate_stress(hd_cosine_dist, ld_cosine_dist_2d)
    stress_cosine_1d = calculate_stress(hd_cosine_dist, ld_cosine_dist_1d)
    stress_cosine_custom = calculate_stress(hd_cosine_dist, ld_cosine_dist_custom)
    correlation_cosine_2d = calculate_correlation(hd_cosine_dist, ld_cosine_dist_2d)
    correlation_cosine_1d = calculate_correlation(hd_cosine_dist, ld_cosine_dist_1d)
    correlation_cosine_custom = calculate_correlation(hd_cosine_dist, ld_cosine_dist_custom)
    # Manhattan
    stress_manhattan_2d = calculate_stress(hd_manhattan_dist, ld_manhattan_dist_2d)
    stress_manhattan_1d = calculate_stress(hd_manhattan_dist, ld_manhattan_dist_1d)
    stress_manhattan_custom = calculate_stress(hd_manhattan_dist, ld_manhattan_dist_custom)
    correlation_manhattan_2d = calculate_correlation(hd_manhattan_dist, ld_manhattan_dist_2d)
    correlation_manhattan_1d = calculate_correlation(hd_manhattan_dist, ld_manhattan_dist_1d)
    correlation_manhattan_custom = calculate_correlation(hd_manhattan_dist, ld_manhattan_dist_custom)
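    # Illustrative check of the stress definition above (hand-picked numbers, not data
    # from the pipeline): for hd = [[0, 1], [1, 0]] and ld = [[0, 2], [2, 0]],
    # sum((hd - ld)**2) = 2 and sum(hd**2) = 2, so calculate_stress returns sqrt(2 / 2) = 1.0.
    # A stress of 0 means distances are preserved exactly; larger values mean more distortion.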
    # Silhouette scores for each metric
    silhouette_euclidean_2d = silhouette_score(X_2d, y)
    silhouette_euclidean_1d = silhouette_score(X_1d, y)
    silhouette_euclidean_custom = silhouette_score(custom_projection, y)
    silhouette_cosine_2d = silhouette_score(X_2d, y, metric='cosine')
    silhouette_cosine_1d = silhouette_score(X_1d, y, metric='cosine')
    silhouette_cosine_custom = silhouette_score(custom_projection, y, metric='cosine')
    silhouette_manhattan_2d = silhouette_score(X_2d, y, metric='manhattan')
    silhouette_manhattan_1d = silhouette_score(X_1d, y, metric='manhattan')
    silhouette_manhattan_custom = silhouette_score(custom_projection, y, metric='manhattan')

    # Trustworthiness
    # Euclidean
    trust_euclidean_2d = calculate_trustworthiness(data_std, X_2d, distance='euclidean')
    trust_euclidean_1d = calculate_trustworthiness(data_std, X_1d, distance='euclidean')
    trust_euclidean_custom = calculate_trustworthiness(data_std, custom_projection, distance='euclidean')
    # Cosine
    trust_cosine_2d = calculate_trustworthiness(data_std, X_2d, distance='cosine')
    trust_cosine_1d = calculate_trustworthiness(data_std, X_1d, distance='cosine')
    trust_cosine_custom = calculate_trustworthiness(data_std, custom_projection, distance='cosine')
    # Manhattan
    trust_manhattan_2d = calculate_trustworthiness(data_std, X_2d, distance='manhattan')
    trust_manhattan_1d = calculate_trustworthiness(data_std, X_1d, distance='manhattan')
    trust_manhattan_custom = calculate_trustworthiness(data_std, custom_projection, distance='manhattan')

    # Print a summary of the calculated metrics
    print("2D MDS Metrics:")
    print(f" Euclidean - Stress: {stress_euclidean_2d}, Correlation: {correlation_euclidean_2d}, Silhouette: {silhouette_euclidean_2d}, Trustworthiness: {trust_euclidean_2d}, Avg Dist: {avg_dist_euclidean_2d}")
    print(f" Cosine - Stress: {stress_cosine_2d}, Correlation: {correlation_cosine_2d}, Silhouette: {silhouette_cosine_2d}, Trustworthiness: {trust_cosine_2d}, Avg Dist: {avg_dist_cosine_2d}")
    print(f" Manhattan - Stress: {stress_manhattan_2d}, Correlation: {correlation_manhattan_2d}, Silhouette: {silhouette_manhattan_2d}, Trustworthiness: {trust_manhattan_2d}, Avg Dist: {avg_dist_manhattan_2d}")
    print("\n")
    print("1D MDS Metrics:")
    print(f" Euclidean - Stress: {stress_euclidean_1d}, Correlation: {correlation_euclidean_1d}, Silhouette: {silhouette_euclidean_1d}, Trustworthiness: {trust_euclidean_1d}, Avg Dist: {avg_dist_euclidean_1d}")
    print(f" Cosine - Stress: {stress_cosine_1d}, Correlation: {correlation_cosine_1d}, Silhouette: {silhouette_cosine_1d}, Trustworthiness: {trust_cosine_1d}, Avg Dist: {avg_dist_cosine_1d}")
    print(f" Manhattan - Stress: {stress_manhattan_1d}, Correlation: {correlation_manhattan_1d}, Silhouette: {silhouette_manhattan_1d}, Trustworthiness: {trust_manhattan_1d}, Avg Dist: {avg_dist_manhattan_1d}")
    print("\n")
    print("Custom Projection Metrics:")
    print(f" Euclidean - Stress: {stress_euclidean_custom}, Correlation: {correlation_euclidean_custom}, Silhouette: {silhouette_euclidean_custom}, Trustworthiness: {trust_euclidean_custom}, Avg Dist: {avg_dist_euclidean_custom}")
    print(f" Cosine - Stress: {stress_cosine_custom}, Correlation: {correlation_cosine_custom}, Silhouette: {silhouette_cosine_custom}, Trustworthiness: {trust_cosine_custom}, Avg Dist: {avg_dist_cosine_custom}")
    print(f" Manhattan - Stress: {stress_manhattan_custom}, Correlation: {correlation_manhattan_custom}, Silhouette: {silhouette_manhattan_custom}, Trustworthiness: {trust_manhattan_custom}, Avg Dist: {avg_dist_manhattan_custom}")
    print("\n")
    return {
        "2D MDS Metrics": {
            "Euclidean": {"Stress": stress_euclidean_2d, "Correlation": correlation_euclidean_2d,
                          "Silhouette": silhouette_euclidean_2d, "Trustworthiness": trust_euclidean_2d,
                          "Avg Dist": avg_dist_euclidean_2d},
            "Cosine": {"Stress": stress_cosine_2d, "Correlation": correlation_cosine_2d,
                       "Silhouette": silhouette_cosine_2d, "Trustworthiness": trust_cosine_2d,
                       "Avg Dist": avg_dist_cosine_2d},
            "Manhattan": {"Stress": stress_manhattan_2d, "Correlation": correlation_manhattan_2d,
                          "Silhouette": silhouette_manhattan_2d, "Trustworthiness": trust_manhattan_2d,
                          "Avg Dist": avg_dist_manhattan_2d},
        },
        "1D MDS Metrics": {
            "Euclidean": {"Stress": stress_euclidean_1d, "Correlation": correlation_euclidean_1d,
                          "Silhouette": silhouette_euclidean_1d, "Trustworthiness": trust_euclidean_1d,
                          "Avg Dist": avg_dist_euclidean_1d},
            "Cosine": {"Stress": stress_cosine_1d, "Correlation": correlation_cosine_1d,
                       "Silhouette": silhouette_cosine_1d, "Trustworthiness": trust_cosine_1d,
                       "Avg Dist": avg_dist_cosine_1d},
            "Manhattan": {"Stress": stress_manhattan_1d, "Correlation": correlation_manhattan_1d,
                          "Silhouette": silhouette_manhattan_1d, "Trustworthiness": trust_manhattan_1d,
                          "Avg Dist": avg_dist_manhattan_1d},
        },
        "Custom Projection Metrics": {
            "Euclidean": {"Stress": stress_euclidean_custom, "Correlation": correlation_euclidean_custom,
                          "Silhouette": silhouette_euclidean_custom, "Trustworthiness": trust_euclidean_custom,
                          "Avg Dist": avg_dist_euclidean_custom},
            "Cosine": {"Stress": stress_cosine_custom, "Correlation": correlation_cosine_custom,
                       "Silhouette": silhouette_cosine_custom, "Trustworthiness": trust_cosine_custom,
                       "Avg Dist": avg_dist_cosine_custom},
            "Manhattan": {"Stress": stress_manhattan_custom, "Correlation": correlation_manhattan_custom,
                          "Silhouette": silhouette_manhattan_custom, "Trustworthiness": trust_manhattan_custom,
                          "Avg Dist": avg_dist_manhattan_custom},
        },
    }


def preprocess_iris():
    iris = datasets.load_iris(as_frame=True)
    iris_std = iris.data - iris.data.mean()
    y = iris.target
    X = iris_std
    # hd_dist_mat = cosine_distances(iris_std.to_numpy()) / 2
    return X, y, iris_std


def preprocess_2dsphere(n_points=100, n_classes=2, d=2):
    data = skdim.datasets.hyperSphere(n=n_points, d=d)
    df = pd.DataFrame(data, columns=['x', 'y'])
    # Classify points based on the x coordinate being positive or negative
    y = np.where(df['x'] >= 0, 1, 0)
    scaler = StandardScaler()
    data_std = scaler.fit_transform(df)
    return df, y, data_std


def preprocess_3dsphere(n_points=100, n_classes=2, d=3):
    data = skdim.datasets.hyperSphere(n=n_points, d=d)
    df = pd.DataFrame(data, columns=['x', 'y', 'z'])
    y = np.where(df['x'] >= 0, 1, 0)
    scaler = StandardScaler()
    data_std = scaler.fit_transform(df)
    return df, y, data_std


def preprocess_4dsphere(n_points=100, n_classes=2, d=4):
    data = skdim.datasets.hyperSphere(n=n_points, d=d)
    df = pd.DataFrame(data, columns=['x', 'y', 'z', 'a'])
    y = np.where(df['x'] >= 0, 1, 0)
    scaler = StandardScaler()
    data_std = scaler.fit_transform(df)
    return df, y, data_std


def preprocess_5dsphere(n_points=100, n_classes=2, d=5):
    data = skdim.datasets.hyperSphere(n=n_points, d=d)
    df = pd.DataFrame(data, columns=['x', 'y', 'z', 'a', 'b'])
    y = np.where(df['x'] >= 0, 1, 0)
    scaler = StandardScaler()
    data_std = scaler.fit_transform(df)
    return df, y, data_std
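# Labelling convention for the hypersphere datasets above: class 1 if the first coordinate
# is non-negative, class 0 otherwise, so each sphere is split into two hemispheres.
# For example, a sampled point (0.6, -0.8) gets class 1 and (-0.6, 0.8) gets class 0.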
def preprocess_s_curve(n_points=1000):
    from sklearn.datasets import make_s_curve
    data, t = make_s_curve(n_points, random_state=777)
    data_mean_subtracted = data - np.mean(data, axis=0)
    labels = np.where(t > np.median(t), 1, 0)
    df = pd.DataFrame(data_mean_subtracted, columns=['x', 'y', 'z'])
    return data_mean_subtracted, labels, df


def preprocess_citations(sample_frac=1):
    import json
    json_path = "data/papers_with_keys.json"
    with open(json_path) as file:
        data = json.load(file)
    papers_df = pd.DataFrame(data['papers'])
    papers_df_sampled = papers_df.sample(frac=sample_frac, random_state=777)
    features = ['citationCount', 'referenceCount', 'year']
    papers_mean_subtracted = papers_df_sampled[features] - papers_df_sampled[features].mean()
    papers_df_sampled['layer'] = papers_df_sampled['layer'].astype('category')
    # papers_mean_subtracted serves both as D and data_std
    return papers_mean_subtracted, papers_df_sampled['layer'], papers_mean_subtracted


def preprocess_concentric_circles(num_circles=3, num_points=100):
    # Initialize arrays to store data
    all_x = []
    all_y = []
    classes = []
    # Generate points for each circle
    for i in range(num_circles):
        radius = 1 + i * 0.5  # incrementing radius for each circle
        theta = np.linspace(0, 2 * np.pi, num_points)
        x = radius * np.cos(theta)
        y = radius * np.sin(theta)
        all_x.extend(x)
        all_y.extend(y)
        classes.extend([i] * num_points)  # assign class 'i' for each circle
    # Convert to DataFrame
    df = pd.DataFrame({'x': all_x, 'y': all_y, 'class': classes})
    # Subtract the mean from each feature
    df[['x', 'y']] = df[['x', 'y']] - df[['x', 'y']].mean()
    # Extract X and y
    X = df[['x', 'y']].values  # features with mean subtracted
    y = df['class'].values  # classes
    return X, y, df


def preprocess_blobs(n_points=80, n_clusters=4):
    centers = [(-5, -5), (5, -5), (-5, 5), (5, 5)]
    data, labels = make_blobs(n_samples=n_points, centers=centers, n_features=2, random_state=777)
    df = pd.DataFrame(data, columns=['x', 'y'])
    df_shifted = df - df.mean()
    plt.figure(figsize=(6, 6))
    for i in range(n_clusters):
        plt.scatter(df_shifted['x'][labels == i], df_shifted['y'][labels == i],
                    label=f'Cluster {i}', edgecolor='k', s=50, alpha=0.7)
    plt.title("Blobs in Quadrants Visualization")
    plt.xlabel("Feature 1")
    plt.ylabel("Feature 2")
    plt.legend()
    plt.grid(True)
    plt.show()
    return df_shifted, labels, df_shifted


def preprocess_penguins():
    penguins = sns.load_dataset('penguins').dropna().reset_index(drop=True)
    features = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']
    penguins_features = penguins[features]
    penguins_std = penguins_features - penguins_features.mean()
    hd_dist_mat = cosine_distances(penguins_std) / 2
    target = 'species'
    species_mapping = {species: idx for idx, species in enumerate(penguins[target].unique())}
    y = penguins[target].map(species_mapping).values
    sns.pairplot(penguins_std.join(penguins[target]), hue=target)
    plt.suptitle('Pairplot of Standardized Penguin Features', verticalalignment='top')
    return penguins_std, y, penguins_std
def preprocess_penguins_around_max():
    penguins = sns.load_dataset('penguins').dropna().reset_index(drop=True)
    features = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']
    penguins_features = penguins[features]
    # Step 1: Center the data by subtracting the mean
    penguins_centered = penguins_features - penguins_features.mean()
    # Find the maximum value point in the centered data
    max_point = penguins_centered.max()
    # specified_point is the per-feature target value the maximum should be moved to
    specified_point = [1, 1, 1, 1]  # shift each feature so that its maximum sits at 1
    # Calculate the difference between the specified point and the maximum value point
    shift = specified_point - max_point
    # Step 2: Shift the centered data to the specified point
    penguins_std = penguins_centered + shift
    hd_dist_mat = cosine_distances(penguins_std) / 2
    target = 'species'
    species_mapping = {species: idx for idx, species in enumerate(penguins[target].unique())}
    y = penguins[target].map(species_mapping).values
    sns.pairplot(penguins_std.join(penguins[target]), hue=target)
    plt.suptitle('Pairplot of Standardized Penguin Features', verticalalignment='top')
    return penguins_std, y, penguins_std


def preprocess_penguins_around_min_halfway():
    penguins = sns.load_dataset('penguins').dropna().reset_index(drop=True)
    features = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']
    penguins_features = penguins[features]
    # Step 1: Center the data by subtracting the mean
    penguins_centered = penguins_features - penguins_features.mean()
    # Find the minimum value point in the centered data
    min_point = penguins_centered.min()
    # specified_point is the per-feature target value used for the halfway shift
    specified_point = [1, 1, 1, 1]  # shift each feature so that its minimum moves halfway to 1
    # Calculate the halfway point between the specified point and the minimum value point
    halfway_shift = (specified_point + min_point) / 2
    # Calculate the final shift to apply to the centered data
    shift = halfway_shift - min_point
    # Step 2: Shift the centered data to the halfway point
    penguins_std = penguins_centered + shift
    # Calculate the pairwise cosine distances
    hd_dist_mat = cosine_distances(penguins_std) / 2
    target = 'species'
    species_mapping = {species: idx for idx, species in enumerate(penguins[target].unique())}
    y = penguins[target].map(species_mapping).values
    # Pairplot of the standardized features
    sns.pairplot(penguins_std.join(penguins[[target]]), hue=target)
    plt.suptitle('Pairplot of Standardized Penguin Features Shifted Halfway', verticalalignment='top')
    return penguins_std, y, penguins_std
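# Worked example of the shift logic in preprocess_penguins_around_min_halfway (numbers are
# illustrative, not taken from the data): if a centred feature has min_point = -2 and
# specified_point = 1, then halfway_shift = (1 + (-2)) / 2 = -0.5 and
# shift = -0.5 - (-2) = 1.5, so the feature's minimum moves from -2 to -0.5, i.e. halfway
# between its old minimum and the target value 1.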
def preprocess_unbalanced():
    np.random.seed(777)
    n_cluster_points = 100
    n_isolated_points = 3
    cluster = np.random.normal(loc=[-2, 0], scale=0.5, size=(n_cluster_points, 2))
    isolated_points = np.array([[5, 0], [5, 1], [5, -1]])
    data = np.vstack([cluster, isolated_points])
    df = pd.DataFrame(data, columns=['x', 'y'])
    df['target'] = [0] * n_cluster_points + list(range(1, n_isolated_points + 1))
    data_std = data - data.mean()
    hd_dist_mat = cosine_distances(data_std) / 2
    y = df['target'].values
    plt.figure(figsize=(8, 6))
    sns.scatterplot(data=df, x='x', y='y', hue='target', palette='viridis', style='target', markers=True)
    plt.title('Unbalanced Dataset Visualization')
    plt.xlabel('Standardized X')
    plt.ylabel('Standardized Y')
    plt.legend(title='Target')
    plt.show()
    return data_std, y, data_std


# Shared implementation for the preprocess_3d_blobs_v* variants below.
def _preprocess_3d_blobs(n_points=400, n_clusters=8, cluster_std=0.5):
    # Define cluster centers for 8 clusters in 3D
    centers = [(-5, -5, -5), (5, -5, -5), (-5, 5, -5), (5, 5, -5),
               (-5, -5, 5), (5, -5, 5), (-5, 5, 5), (5, 5, 5)]
    X, y = make_blobs(n_samples=n_points, centers=centers, cluster_std=cluster_std,
                      n_features=3, random_state=42)
    # Convert to DataFrame for easier plotting and manipulation
    df = pd.DataFrame(X, columns=['x', 'y', 'z'])
    df['target'] = y
    # Normalize data to range [-1, 1] and center the mean to 0
    df[['x', 'y', 'z']] = (
        (df[['x', 'y', 'z']] - df[['x', 'y', 'z']].mean())
        / (df[['x', 'y', 'z']].max() - df[['x', 'y', 'z']].min()) * 2
    )
    # Plot
    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(111, projection='3d', facecolor='white')
    ax.grid(True, linestyle='-', color='whitesmoke', alpha=0.8)
    ax.set_xticks([-1, -0.5, 0, 0.5, 1])
    ax.set_yticks([-1, -0.5, 0, 0.5, 1])
    ax.set_zticks([-1, -0.5, 0, 0.5, 1])
    # Define color palette
    palette = sns.color_palette("husl", n_clusters)
    # Plot each cluster with a different color
    for i in range(n_clusters):
        ax.scatter(df.loc[df['target'] == i, 'x'],
                   df.loc[df['target'] == i, 'y'],
                   df.loc[df['target'] == i, 'z'],
                   color=palette[i], edgecolor='white', linewidth=0.5, marker='o', s=20)
    ax.set_title('3D Blob Dataset with 8 Clusters (Normalized)')
    ax.set_xlabel('X axis')
    ax.set_ylabel('Y axis')
    ax.set_zlabel('Z axis')
    plt.show()
    print(df, y, df)
    return df.drop('target', axis=1), y, df


def preprocess_3d_blobs_v1(n_points=400, n_clusters=8, cluster_std=0.5):
    return _preprocess_3d_blobs(n_points, n_clusters, cluster_std)


def preprocess_3d_blobs_v2(n_points=400, n_clusters=8, cluster_std=1):
    return _preprocess_3d_blobs(n_points, n_clusters, cluster_std)


def preprocess_3d_blobs_v3(n_points=400, n_clusters=8, cluster_std=4):
    return _preprocess_3d_blobs(n_points, n_clusters, cluster_std)


def preprocess_3d_blobs_v4(n_points=400, n_clusters=8, cluster_std=8):
    return _preprocess_3d_blobs(n_points, n_clusters, cluster_std)
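# The four variants above share _preprocess_3d_blobs and differ only in cluster_std
# (0.5, 1, 4, 8): the centres sit 10 units apart along each axis, so the smaller values
# give well-separated clusters while the larger ones make the eight blobs overlap heavily.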
# A flatter torus with a dense point distribution, coloured by a continuous target.
def preprocess_3d_torus(n_points=30, R=3, r=0.6, n_targets=5, noise_level=0.05):
    theta = np.linspace(0, 2 * np.pi, n_points)
    phi = np.linspace(0, 2 * np.pi, n_points)
    theta, phi = np.meshgrid(theta, phi)
    theta = theta.flatten()
    phi = phi.flatten()
    # Torus parameterisation: theta runs around the tube, phi around the ring;
    # z is scaled by 0.1 to flatten the torus.
    x = (R + r * np.cos(theta)) * np.cos(phi)
    y = (R + r * np.cos(theta)) * np.sin(phi)
    z = r * np.sin(theta) * 0.1
    x += np.random.normal(0, noise_level, x.shape)
    y += np.random.normal(0, noise_level, y.shape)
    z += np.random.normal(0, noise_level, z.shape)
    df = pd.DataFrame({'x': x, 'y': y, 'z': z})
    df[['x', 'y']] = (df[['x', 'y']] - df[['x', 'y']].mean()) / (df[['x', 'y']].max() - df[['x', 'y']].min()) * 2
    # 'target' bins the ring angle phi into n_targets discrete classes;
    # 'continuous_target' gives every point its own value in [0, 1].
    df['target'] = np.floor(((phi / (2 * np.pi)) * n_targets)).astype(int)
    num_points = len(df)
    df['continuous_target'] = np.linspace(0, 1, num_points)
    fig = plt.figure(figsize=(12, 12))
    ax = fig.add_subplot(111, projection='3d', facecolor='white')
    ax.grid(False)  # disable the grid
    from matplotlib.colors import ListedColormap
    num_colors = len(df['continuous_target'].unique())
    husl_colors = sns.color_palette("husl", num_colors)
    husl_cmap = ListedColormap(husl_colors)
    sc = ax.scatter(df['x'], df['y'], df['z'], c=df['continuous_target'], cmap=husl_cmap,
                    edgecolor='white', linewidth=0.5, s=20)
    ax.set_title('Varied 3D Torus Dataset with Continuous Color Scale')
    ax.set_xlabel('X axis')
    ax.set_ylabel('Y axis')
    ax.set_zlabel('Z axis')
    ax.set_xlim(-1, 1)
    ax.set_ylim(-1, 1)
    ax.set_zlim(-0.5, 0.5)
    ax.set_axis_off()
    plt.show()
    # NOTE: the returned labels are the continuous, all-unique 'continuous_target' values;
    # silhouette_score in evaluate() expects discrete classes, for which the binned
    # 'target' column (still present in the returned df) would be the natural choice.
    return df.drop(['target', 'continuous_target'], axis=1), df['continuous_target'].values, df
# Output locations; remove stale result files from previous runs and make sure the
# output directories exist before anything is written to them.
evaluation_file_path = "evaluation_output/evaluation_data.txt"
loss_file_path = "evaluation_output/loss_history.txt"
os.makedirs("evaluation_output/loss_images", exist_ok=True)
os.makedirs("evaluation_output/projections", exist_ok=True)
if os.path.exists(evaluation_file_path):
    os.remove(evaluation_file_path)
if os.path.exists(loss_file_path):
    os.remove(loss_file_path)

n_evaluations = 5


def evaluate_n_times(data_std, X_custom, X_2d, X_1d, y, n=5):
    # Run evaluate() n times and report the average and variance of every metric.
    # evaluate() has no stochastic steps, so the variance mainly serves as a sanity check.
    all_metrics = []
    for _ in range(n):
        metrics = evaluate(data_std, X_custom, X_2d, X_1d, y)
        all_metrics.append(metrics)
    avg_var_metrics = {}
    for dimension in all_metrics[0]:
        avg_var_metrics[dimension] = {}
        for projection in all_metrics[0][dimension]:
            avg_var_metrics[dimension][projection] = {}
            for metric in all_metrics[0][dimension][projection]:
                values = [m[dimension][projection][metric] for m in all_metrics]
                avg_var_metrics[dimension][projection][metric] = {
                    "Average": np.mean(values),
                    "Variance": np.var(values),
                }
    return avg_var_metrics


# Datasets to evaluate. The three penguin variants get distinct keys so that all of them
# run (duplicate dictionary keys would silently overwrite each other).
data_preprocessing_funcs = {
    "iris": preprocess_iris,
    "2dsphere": preprocess_2dsphere,
    "3dsphere": preprocess_3dsphere,
    "4dsphere": preprocess_4dsphere,
    "5dsphere": preprocess_5dsphere,
    "blobs": preprocess_blobs,
    "blobs_3d_v1": preprocess_3d_blobs_v1,
    "blobs_3d_v2": preprocess_3d_blobs_v2,
    "blobs_3d_v3": preprocess_3d_blobs_v3,
    "blobs_3d_v4": preprocess_3d_blobs_v4,
    "torus": preprocess_3d_torus,
    "concentric_circles": preprocess_concentric_circles,
    "penguins": preprocess_penguins,
    "penguins_around_max": preprocess_penguins_around_max,
    "penguins_around_min_halfway": preprocess_penguins_around_min_halfway,
    "unbalanced": preprocess_unbalanced,
    "s_curve": preprocess_s_curve,
    # "three_circles": preprocess_concentric_circles,
    # "citations": preprocess_citations,  # will be possible when published
}

loss_histories = {}
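# Each preprocess_* function is a zero-argument callable returning (D, y, data_std):
# D is the feature table handed to the projections, y the per-sample labels used for
# colouring and silhouette scores, and data_std the high-dimensional reference for the
# distance-based metrics. To register an additional dataset, follow the same convention,
# e.g. (hypothetical name, not defined in this script):
# data_preprocessing_funcs["my_dataset"] = preprocess_my_dataset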
for label, func in data_preprocessing_funcs.items():
    print("||||||||||||||||||||" + label + "|||||||||||||||")
    D, y, data_std = func()

    # Project the dataset with 2D MDS, 1D MDS and the circular projection (cPro).
    X_2d = apply_2d_mds(D)
    X_1d = apply_1d_mds(D)
    gd_result = gradient_descent(D, y, n_iter=20, learning_rate=0.02)
    X_custom = gd_result["data_prepared"]
    loss = gd_result["loss"]
    loss_history = gd_result["loss_history"]
    gd_time = gd_result["time"]
    data_raw = gd_result["points"]

    # Save the per-dataset loss curve.
    plt.plot(loss_history)
    plt.title(f'Loss History for {label}')
    plt.xlabel('Iteration')
    plt.ylabel('Loss')
    plt.savefig(f"evaluation_output/loss_images/{label}_loss_history.png")
    plt.clf()

    # visualize_cPro(X_custom, y)
    visualize_2d_mds(X_2d, y)
    visualize_1d_mds(X_1d, y)

    evaluation_results = evaluate_n_times(data_std, X_custom, X_2d, X_1d, y, n=n_evaluations)

    # Export the circular projection with a simple 1-based id column.
    if 'id' not in X_custom.columns:
        X_custom['id'] = range(1, len(D) + 1)
    export_path = f"evaluation_output/projections/{label}_cPro.csv"
    X_custom.to_csv(export_path, index=False)
    print(f"Data exported for {label} to {export_path}")

    # Append the averaged metrics for this dataset to the evaluation report.
    with open(evaluation_file_path, 'a') as file:
        file.write(f"\n--- Evaluation Metrics for {label} ---\n")
        for dimension, projections in evaluation_results.items():
            file.write(f"{dimension}:\n")
            for projection, metrics in projections.items():
                file.write(f" {projection}:\n")
                for metric, stats in metrics.items():
                    file.write(f" {metric}: Average = {stats['Average']}, Variance = {stats['Variance']}\n")

    loss_histories[label] = loss_history
    with open(loss_file_path, 'a') as file:
        file.write(f"\n--- Loss History for {label} ---\n")
        for iteration, loss in enumerate(loss_history, start=1):
            file.write(f"Iteration {iteration}: Loss {loss}\n")

# Combined loss curves across all datasets.
plt.figure(figsize=(10, 6))
for label, loss_history in loss_histories.items():
    if isinstance(loss_history, torch.Tensor):
        if loss_history.ndim > 0:
            loss_history = loss_history.tolist()
        else:
            loss_history = [loss_history.item()]
    elif isinstance(loss_history, float):
        loss_history = [loss_history]
    iterations = list(range(1, len(loss_history) + 1))
    plt.plot(iterations, loss_history, label=label)
plt.title('Loss History for Different Datasets')
plt.xlabel('Iteration')
plt.ylabel('Loss')
plt.legend()
plt.savefig('evaluation_output/loss_images/combined_loss_images.png')
plt.show()
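# Files produced by a full run (paths as defined above):
#   evaluation_output/evaluation_data.txt                   - averaged metrics per dataset
#   evaluation_output/loss_history.txt                      - per-iteration cPro loss per dataset
#   evaluation_output/loss_images/<label>_loss_history.png  - individual loss curves
#   evaluation_output/loss_images/combined_loss_images.png  - combined loss curves
#   evaluation_output/projections/<label>_cPro.csv          - exported circular projections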