* do-file for lecture 9 of VHM 802, Winter 2024
version 18 /* works also with versions 14-17 */
set more off
set scheme stcolor_alt
cd "r:\"

* Manly Example 1.4
import delimited prehist_dog.csv, clear
foreach var of varlist mandbreadth-mol14length {
  egen `var's=std(`var')
  }
matrix dissim distdogs=mandbreadths-mol14lengths, names(group)
matrix list distdogs /* for display only */

* GM salmon farms example (note: datafile not included in course material)
import delimited SiteID.csv, varnames(1) clear
tostring siteid, gen(sitelabel)
gen xcoordkm=xcoord/1000
gen ycoordkm=ycoord/1000
matrix dissim dist3b=xcoordkm ycoordkm if bma=="3b", names(sitelabel)
matrix list dist3b /* for display only */
mds xcoordkm ycoordkm, id(mfsite) noplot config
mdsconfig, title("Euclidean distance") xneg msize(vsmall) mlabsize(vsmall)
* should fit perfectly to map
mds xcoordkm ycoordkm, id(mfsite) method(modern) noplot config
* identical to classical

import delimited seawaydist.csv, varnames(nonames) clear
* note: file not included in course material
foreach var of varlist v2-v23 {
  replace `var'=`var'/1000
  }
mkmat v2-v10 if _n<=9, matrix(swdist3b) rownames(v1)
matrix list swdist3b /* for display only */
mkmat v2-v23, matrix(swdist) rownames(v1)
mdsmat swdist, noplot config
mdsconfig , title("Sea-way distance") xneg aspect(0.6666667) msize(vsmall) mlabsize(vsmall)
mdsmat swdist, method(modern) noplot config
* not the same as classical, but visually almost indistinguishable
mdsconfig , title("Sea-way distance") xneg aspect(0.6666667)
* same as classical when loss(strain) is used
mdsmat swdist, method(modern) loss(strain) noplot config

mkmat v2-v6 if _n<=5, matrix(swdist15) rownames(v1)
matrix list swdist15 /* just for display */
keep if _n<=5
clustermat single swdist15, name(single15) add
cluster dendrogram single15, label(v1) title("GM Farms 1-5: Single linkage") ytitle("L2 dissimilarity (km)")
clustermat average swdist15, name(aver15) add
cluster dendrogram aver15, label(v1) title("GM Farms 1-5: Average linkage") ytitle("L2 dissimilarity (km)")
clustermat complete swdist15, name(compl15) add
cluster dendrogram compl15, label(v1) title("GM Farms 1-5: Complete linkage") ytitle("L2 dissimilarity (km)")

clear
clear matrix
set maxvar 7500
set matsize 7000
import delimited nciarrayxp.csv, varnames(1) clear
matrix dissim dist1_9=gene1-gene6830 if cell<10, names(label)
matrix list dist1_9 /* for display only */
mds gene1-gene6830, id(label) noplot config
mdsconfig , title("Euclidean distance") maxlength(2)
cluster average gene1-gene6830, name(aver_cell)
cluster dendrogram aver_cell, label(label) xlabel(, angle(90) labsize(*.5)) title("")

* Iris data
import delimited iris.csv, clear
set seed 210310
forval i=1(1)10 {
  cluster kmeans sepallength-petalwidth, k(3) start(krandom) name(iris3m`i')
  tab species iris3m`i', row
  scalar wss=0
  scalar mss=0
  foreach var of varlist sepallength-petalwidth {
    quietly anova `var' iris3m`i'
    scalar wss=wss+e(rss)
	scalar mss=mss+e(mss)
    }
  di "Within-cluster sum-of-squares: " wss "  percent of total:  "  wss/(wss+mss)*100
}

foreach var of varlist sepallength-petalwidth {
  egen `var's=std(`var')
  }
forval i=1(1)10 {
  cluster kmeans sepallengths-petalwidths, k(3) start(krandom) name(iris3sm`i')
  tab species iris3sm`i', row
  scalar wss=0
  scalar mss=0
  foreach var of varlist sepallengths-petalwidths {
    quietly anova `var' iris3sm`i'
    scalar wss=wss+e(rss)
    scalar mss=mss+e(mss)
    }
  di "Within-cluster sum-of-squares: " wss "  percent of total:  "  wss/(wss+mss)*100
}

* R command: kmeans(iris[,1:4],centers=3,nstart=100)
