import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
# Seed NumPy's global RNG before any random draw; np.random.choice below
# reads from it, and make_classification is called without a random_state,
# so the order of these calls is kept exactly as-is.
np.random.seed(1337)
X, y = make_classification(n_features=20, n_samples=1000)

# Choose 10 distinct row indices to serve as the initial labeled pool.
n_total = X.shape[0]
labeled_idx = np.random.choice(n_total, size=10, replace=False)
# Every remaining row index, in ascending order, forms the unlabeled pool.
unlabeled_idx = np.delete(np.arange(n_total), labeled_idx)

# Scatter the first two feature columns. The unlabeled pool is drawn first
# with small markers so the larger labeled markers end up on top.
for _rows, _colour, _tag, _size in (
    (unlabeled_idx, 'blue', 'Unlabeled', 2),
    (labeled_idx, 'red', 'Labeled', 10),
):
    plt.scatter(X[_rows, 0], X[_rows, 1], c=_colour, label=_tag, s=_size)
plt.legend()
plt.show()