@@ -9,9 +9,6 @@ def inner(*moreargs):
9
9
return func (* args , * moreargs )
10
10
return inner
11
11
12
- Point = Tuple [float , ...]
13
- Centroid = Point
14
-
15
12
def mean (data : Iterable [float ]) -> float :
16
13
'Accurate arithmetic mean'
17
14
data = list (data )
@@ -21,8 +18,11 @@ def transpose(matrix: Iterable[Iterable]) -> Iterable[tuple]:
21
18
'Swap rows with columns for a 2-D array'
22
19
return zip (* matrix )
23
20
21
# A point is a tuple of float coordinates of any length; a centroid is
# represented the same way.
Point = Tuple[float, ...]
Centroid = Point


def dist(p: Point, q: Point, sqrt=sqrt, fsum=fsum, zip=zip) -> float:
    """Multi-dimensional euclidean distance.

    Return the square root of the summed squared coordinate differences
    between *p* and *q*.  Coordinates are paired with ``zip``, so if the
    two points differ in length the extra coordinates of the longer one
    are ignored.

    The ``sqrt``, ``fsum`` and ``zip`` parameters default to the
    module-level functions of the same names; callers normally leave
    them alone (presumably they exist to bind the globals as fast local
    names).
    """
    return sqrt(fsum((x1 - x2) ** 2.0 for x1, x2 in zip(p, q)))
27
27
28
28
def assign_data (centroids : Sequence [Centroid ], data : Iterable [Point ]) -> Dict [Centroid , Sequence [Point ]]:
@@ -46,10 +46,16 @@ def k_means(data: Iterable[Point], k:int=2, iterations:int=10) -> List[Point]:
46
46
centroids = compute_centroids (labeled .values ())
47
47
return centroids
48
48
49
def quality(labeled: Dict[Centroid, Sequence[Point]]) -> float:
    """Mean value of squared distances from data to its assigned centroid.

    *labeled* maps each centroid to the sequence of points stored under
    it.  Every (centroid, point) pair contributes ``dist(c, p) ** 2`` and
    the result is the ``mean`` of all those contributions.
    """
    return mean(dist(c, p) ** 2 for c, pts in labeled.items() for p in pts)
52
+
53
+
49
54
if __name__ == '__main__' :
50
55
51
56
from pprint import pprint
52
57
58
+ print ('Simple example with six 3-D points clustered into two groups' )
53
59
points = [
54
60
(10 , 41 , 23 ),
55
61
(22 , 30 , 29 ),
@@ -62,12 +68,10 @@ def k_means(data: Iterable[Point], k:int=2, iterations:int=10) -> List[Point]:
62
68
centroids = k_means (points , k = 2 )
63
69
pprint (assign_data (centroids , points ))
64
70
65
- if __name__ == '__main__' :
66
- # https://www.datascience.com/blog/introduction-to-k-means-clustering-algorithm-learn-data-science-tutorials
67
- from pprint import pprint
71
+ print ('\n Example with a richer dataset.' )
72
+ print ('See: https://www.datascience.com/blog/introduction-to-k-means-clustering-algorithm-learn-data-science-tutorials' )
68
73
69
74
data = [
70
-
71
75
(10 , 30 ),
72
76
(12 , 50 ),
73
77
(14 , 70 ),
@@ -89,8 +93,9 @@ def k_means(data: Iterable[Point], k:int=2, iterations:int=10) -> List[Point]:
89
93
(90 , 160 ),
90
94
]
91
95
92
- # 5583 1338 1202 668 611 409 463
93
- centroids = k_means (data , k = 4 , iterations = 20 )
94
- d = assign_data (centroids , data )
95
- pprint (d )
96
-
96
+ print ('k quality' )
97
+ print ('- -------' )
98
+ for k in range (1 , 8 ):
99
+ centroids = k_means (data , k , iterations = 20 )
100
+ d = assign_data (centroids , data )
101
+ print (f'{ k } { quality (d ) :8,.1f} ' )
0 commit comments