1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46
| from sklearn.cluster import DBSCAN from sklearn import metrics import pandas as pd
def DBSCAN_tune(data, epsList, minList): results = [] for eps in epsList: for min_samples in minList: try: dbscan = DBSCAN(eps=eps, min_samples=min_samples) dbscan.fit(data) labels = dbscan.labels_ n_clusters = len(set(labels)) - (1 if -1 in labels else 0) outliers = len(labels[labels[:] == -1]) noiseRatio = outliers / len(labels) counter = Counter(labels) silhouette = metrics.silhouette_score(data, labels) v_measure = metrics.v_measure_score(labels, totalLabel) results.append({'eps':eps, 'min_samples':min_samples, 'n_clusters':n_clusters, 'n_outliers':outliers, 'noiseRatio':noiseRatio, 'distribution':counter, 'silhouette':silhouette, 'v_measure':v_measure}) except: print(eps, min_samples) df = pd.DataFrame(results) return df
statistics = DBSCAN_tune(data, epsList=np.arange(0.1, 5, 0.05), minList=range(2, 15)) tmp = statistics[statistics['n_clusters'] == 3] tmp.sort_values(['v_measure'], ascending=False) tmp.sort_values(['silhouette'], ascending=False)
db = DBSCAN(eps=opt_eps, min_samples=opt_minsamples) db.fit(data) labels = db.labels_ ratio = len(labels[labels[:] == -1]) / len(labels) print('Noise ratio : {:.4f}%'.format(ratio)) n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) print('Num clusters : {}'.format(n_clusters_)) print('Silhouette_Coefficient : {}'.format(metrics.silhouette_score(data, labels)))
>>> Noise ratio : 0.1615% >>> Num clusters : 13 >>> Silhouette_Coefficient : 0.21903867779598668
|