Scikit-learn
sklearn.datasets
make_blobs
Generate data
# Generate a synthetic, labelled data set for clustering experiments
from sklearn.datasets import make_blobs

# X holds the sample coordinates, y the cluster label of each sample.
X, y = make_blobs(
    n_samples=1000,         # total number of points
    n_features=3,           # each point is 3-dimensional
    centers=3,              # number of clusters to generate
    cluster_std=[1, 2, 3],  # one standard deviation per cluster
    center_box=(10, 65),    # range the random cluster centers are drawn from
    random_state=0,         # fixed seed so the data is reproducible
)
sklearn.preprocessing
StandardScaler
Standardize data
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_blobs

# Build a 3-dimensional sample array containing 3 clusters
X, y = make_blobs(
    n_samples=1000,
    n_features=3,
    centers=3,
    cluster_std=[1, 2, 3],
    center_box=(10, 65),
    random_state=0,
)

# Fit the scaler on X and replace X with its standardized version
scaler = StandardScaler()
X = scaler.fit_transform(X)
sklearn.decomposition
PCA
Apply principal component analysis (PCA), i.e. dimensionality reduction
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np

# NOTE: this snippet assumes the data set has already been standardized,
# and that `df` and `col_name` are defined before this point.
#
# df[col_name] is expected to be a DataFrame like this:
#
#          x0        x1        x2
#   -0.366353  1.022466  1.166899
#   -1.179214  1.318905  1.047407

# Perform PCA without limiting n_components.
# Only the fitted model is used below, so call fit() instead of
# fit_transform(): the projected data would be computed and then discarded.
pca = PCA()
pca.fit(df[col_name])

# 1-based component indices: 1, 2, ..., n_components_
PC_components = np.arange(1, pca.n_components_ + 1)