| 12
 3
 4
 5
 6
 7
 8
 9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 
 | from collections import Counter

 import numpy as np
 import pandas as pd
 from sklearn import metrics
 from sklearn.cluster import DBSCAN
 
 
 def DBSCAN_tune(data, epsList, minList, true_labels=None):
     """Grid-search DBSCAN over every (eps, min_samples) pair and tabulate the runs.

     Args:
         data: Samples passed unchanged to ``DBSCAN.fit`` and
             ``metrics.silhouette_score``.
         epsList: Iterable of ``eps`` values to try.
         minList: Iterable of ``min_samples`` values to try.
         true_labels: Optional ground-truth labels used for the V-measure.
             When omitted, a module-level ``totalLabel`` is used if one exists
             (this is what the function relied on before the parameter was
             added); if neither is available, ``v_measure`` is recorded as NaN.

     Returns:
         A ``pandas.DataFrame`` with one row per successful combination and the
         columns ``eps``, ``min_samples``, ``n_clusters``, ``n_outliers``,
         ``noiseRatio``, ``distribution``, ``silhouette`` and ``v_measure``.
         The columns are present even when no combination succeeds, so callers
         can filter on them without a ``KeyError``.
     """
     columns = ['eps', 'min_samples', 'n_clusters', 'n_outliers',
                'noiseRatio', 'distribution', 'silhouette', 'v_measure']

     if true_labels is None:
         # Backward compatibility: fall back to the module-level ground truth
         # that the function used to read implicitly.
         true_labels = globals().get('totalLabel')

     results = []
     for eps in epsList:
         for min_samples in minList:
             try:
                 dbscan = DBSCAN(eps=eps, min_samples=min_samples)
                 dbscan.fit(data)
                 labels = dbscan.labels_
                 silhouette = metrics.silhouette_score(data, labels)
                 if true_labels is None:
                     v_measure = float('nan')
                 else:
                     # V-measure is symmetric, so the conventional
                     # (true, predicted) order gives the same score.
                     v_measure = metrics.v_measure_score(true_labels, labels)
             except ValueError as err:
                 # Only the expected scoring/fitting failures are skipped
                 # (e.g. the silhouette is undefined when fewer than two
                 # distinct labels are found). Anything else, such as a
                 # NameError from a missing import, now surfaces instead of
                 # silently emptying the table.
                 print('skipped eps={}, min_samples={}: {}'.format(
                     eps, min_samples, err))
                 continue

             # Label -1 marks noise points and is not counted as a cluster.
             n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
             outliers = int((labels == -1).sum())
             results.append({
                 'eps': eps,
                 'min_samples': min_samples,
                 'n_clusters': n_clusters,
                 'n_outliers': outliers,
                 'noiseRatio': outliers / len(labels),
                 'distribution': Counter(labels),
                 'silhouette': silhouette,
                 'v_measure': v_measure,
             })

     return pd.DataFrame(results, columns=columns)
 
 
 
 # Grid-search eps over [0.1, 5) in steps of 0.05 and min_samples over 2..14.
 statistics = DBSCAN_tune(data, epsList=np.arange(0.1, 5, 0.05), minList=range(2, 15))

 # Keep the settings that produced exactly three clusters and show them ranked
 # by each quality score (the sorted frames used to be computed and discarded).
 tmp = statistics[statistics['n_clusters'] == 3]
 print(tmp.sort_values(['v_measure'], ascending=False))
 print(tmp.sort_values(['silhouette'], ascending=False))

 # NOTE(review): opt_eps and opt_minsamples are not defined in this snippet;
 # presumably they are chosen by hand from the tables above - confirm.
 db = DBSCAN(eps=opt_eps, min_samples=opt_minsamples)
 db.fit(data)
 labels = db.labels_

 # Label -1 marks noise. ``ratio`` is a fraction in [0, 1], so it is formatted
 # with the percent spec instead of appending '%' to the raw fraction.
 ratio = float((labels == -1).mean())
 print('Noise ratio : {:.2%}'.format(ratio))
 n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
 print('Num clusters : {}'.format(n_clusters_))
 print('Silhouette_Coefficient : {}'.format(metrics.silhouette_score(data, labels)))

 # Output recorded from the original run, which printed the noise ratio as a
 # raw fraction followed by '%' (0.1615 is about 16.15 % of the points):
 # >>> Noise ratio : 0.1615%
 # >>> Num clusters : 13
 # >>> Silhouette_Coefficient : 0.21903867779598668
 
 |