from sklearn.neighbors import NearestNeighbors
def calculate_nearest_neighbors(data, k):
    """Return distances and indices of the k nearest neighbours of each row.

    A ``NearestNeighbors`` model is fitted on ``data`` with ``k + 1``
    neighbours and then queried with the same ``data``. The first column of
    both result arrays is dropped before returning (presumably because it is
    the query point itself at distance zero).

    Args:
        data: samples to fit and query, one row per sample.
        k: number of neighbours to keep per row.

    Returns:
        A ``(distances, indices)`` tuple; each array has one row per sample
        and the columns that remain after the first one is removed.
    """
    model = NearestNeighbors(n_neighbors=k + 1)
    model.fit(data)
    neighbor_dist, neighbor_idx = model.kneighbors(data)
    # Skip column 0 of both arrays.
    return neighbor_dist[:, 1:], neighbor_idx[:, 1:]
import numpy as np
def SMOTE_improved(X, y, k=5, ratio=1.0, n_iterations=5):
    """Oversample the minority class with neighbour-weighted synthetic points.

    On every iteration the k nearest neighbours are recomputed over the
    current data set (original rows plus rows generated by earlier
    iterations). Random minority samples are then drawn, and for each draw a
    synthetic point is created by shifting the sample by the
    inverse-distance-weighted sum of the offsets to its k neighbours.
    Neighbours of any class contribute; weighting depends on distance only.

    Args:
        X: 2-D array of samples, one row per sample.
        y: 1-D array of class labels aligned with the rows of ``X``.
        k: number of nearest neighbours used for each synthetic point.
        ratio: synthetic samples generated per iteration, as a multiple of
            the original minority-class size (``1.0`` means one synthetic
            sample per real minority sample).
        n_iterations: number of generation rounds. In total
            ``n_iterations * int(ratio * n_minority)`` rows are appended.

    Returns:
        ``(X, y)`` with the synthetic rows and their minority-class labels
        appended. The inputs are returned unchanged (as arrays) when ``y`` is
        empty or when no synthetic sample is requested.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    if y.size == 0:
        return X, y

    # The minority class is the least frequent label, not the smallest one.
    # Ties resolve to the smallest label because np.unique sorts its output.
    labels, counts = np.unique(y, return_counts=True)
    minority_class = labels[np.argmin(counts)]

    # Row positions of the original minority samples inside X. They stay
    # valid across iterations because new rows are only ever appended.
    minority_rows = np.flatnonzero(y == minority_class)

    n_synthetic = int(ratio * minority_rows.size)
    if n_synthetic <= 0:
        return X, y

    # Floor for distances so duplicate points do not produce inf/NaN weights.
    min_distance = np.finfo(float).eps

    for _ in range(n_iterations):
        # Neighbours are looked up over the whole (growing) data set.
        distances, indices = calculate_nearest_neighbors(X, k)

        # Draw the seed samples, expressed as row positions in X so that they
        # index `distances` / `indices` consistently.
        draws = np.random.randint(minority_rows.size, size=n_synthetic)
        seeds = minority_rows[draws]

        # Inverse-distance weights, normalised per seed.
        weights = 1.0 / np.maximum(distances[seeds], min_distance)
        weights /= weights.sum(axis=1, keepdims=True)

        # Weighted sum of the offsets from each seed to its neighbours.
        seed_points = X[seeds]
        offsets = X[indices[seeds]] - seed_points[:, np.newaxis, :]
        shift = (weights[:, :, np.newaxis] * offsets).sum(axis=1)
        synthetic = seed_points + shift

        # Append exactly as many labels as rows so X and y stay aligned.
        X = np.vstack((X, synthetic))
        y = np.hstack((y, np.full(n_synthetic, minority_class, dtype=y.dtype)))

    return X, y
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
# Build a two-class data set whose class proportions are set to 0.1 / 0.9.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=3,
    n_redundant=1,
    n_classes=2,
    n_clusters_per_class=1,
    weights=[0.1, 0.9],
    class_sep=2,
    flip_y=0,
    random_state=10,
)

# Hold out 30% of the samples for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=10,
)

# Resample the training split with SMOTE_improved; the test split is left as-is.
X_train_resampled, y_train_resampled = SMOTE_improved(
    X_train,
    y_train,
    k=5,
    ratio=1.0,
    n_iterations=5,
)

# Fit a logistic regression classifier on the resampled training data.
clf = LogisticRegression()
clf.fit(X_train_resampled, y_train_resampled)

# Report per-class metrics on the held-out test split.
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)