Skip to content

Keith's Blog

Scikit-learn

Scikit-learn

sklearn.datasets

make_blobs

Generate data

# Build a synthetic, labelled data set of Gaussian blobs.
from sklearn.datasets import make_blobs

# 1000 samples with 3 features, grouped around 3 centers.  Each cluster has
# its own standard deviation (1, 2 and 3), the centers are drawn inside the
# (10, 65) bounding box, and random_state=0 makes the output reproducible.
X, y = make_blobs(
    n_samples=1000,
    n_features=3,
    centers=3,
    cluster_std=[1, 2, 3],
    center_box=(10, 65),
    random_state=0,
)

sklearn.preprocessing

StandardScaler

Standardize data

from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_blobs

# Synthetic input: 1000 samples, 3 features, 3 clusters of differing spread.
X, y = make_blobs(
    n_samples=1000,
    n_features=3,
    centers=3,
    cluster_std=[1, 2, 3],
    center_box=(10, 65),
    random_state=0,
)

# Fit a fresh StandardScaler on X, then rescale that same data with it.
X = StandardScaler().fit(X).transform(X)

sklearn.decomposition

PCA

Apply principal component analysis (PCA), i.e. dimensionality reduction

from sklearn.decomposition import PCA
import pandas as pd
import numpy as np

# NOTE: this snippet assumes the data set has already been standardized.
#
# df[col_name] is expected to be a DataFrame of feature columns, e.g.:
#
#          x0        x1        x2
#   -0.366353  1.022466  1.166899
#   -1.179214  1.318905  1.047407
#
# `df` and `col_name` are not defined in this snippet; supply them before
# running it (col_name is presumably a list of column labels -- confirm).

# Fit PCA with n_components left at its default (no explicit limit).
# Only the fitted model is used below, so call fit() instead of
# fit_transform() and skip computing a projection that would be discarded.
pca = PCA()
pca.fit(df[col_name])

# 1-based indices of the fitted components: 1, 2, ..., pca.n_components_.
PC_components = np.arange(1, pca.n_components_ + 1)

Comments