# Note: the working directory must be set to the root of /RcodesClustorus before running this script.
setwd("../")
source('Source_files/routines.R')

library(MASS)
library(tidyverse)
library(bio3d)
library(cowplot)
library(ClusterR)
library(mclust)  
library(ggdendro)
# bio data ------------------------------------------
pdb <- torsion.pdb(read.pdb("6M16"))
##   Note: Accessing on-line PDB file
data <- torsion_to_dat(pdb) %>% filter(type == "B") %>% select(phi, psi, position)
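# torsion_to_dat() (from routines.R) is assumed to return per-residue backbone dihedral angles
# (phi, psi, in degrees) together with a type label and residue position; the filter keeps type "B"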

data %>% ggplot(aes(phi, psi)) + geom_point(size = 1) + theme_bw() +
  xlab(expression(~ phi)) + ylab(expression(~ psi)) + theme(axis.title.y = element_text(angle=0, vjust = 0.5)) +
  scale_x_continuous(breaks = c(-2,-1,0,1,2)*90, limits = c(-180, 180),
                     labels = c(expression(- ~pi),expression(- ~pi/2),"0",expression(~pi/2),expression(~pi))) + 
  scale_y_continuous(breaks = c(-2,-1,0,1,2)*90, limits = c(-180, 180),
                     labels = c(expression(- ~pi),expression(- ~pi/2),"0",expression(~pi/2),expression(~pi)))

1. Conformal prediction by KDE

# convert (phi, psi) from degrees to radians and drop residues with missing angles
data <- cbind(data$phi / 180 * pi, data$psi / 180 * pi)
data <- data[complete.cases(data), ]

# KDE-based conformal prediction set over a grid on the torus, at level 0.1
cp.torus.obj <- cp.torus.kde(data, level = 0.1)

b <- cp.torus.obj$cp.torus %>% pivot_longer(4, names_to = "Type", values_to = "value")
g_cp <- ggplot() + geom_contour(aes(phi, psi, z = ifelse(value,1,0), linetype = Type), data = b, size= 1, color = "darkgrey", lineend = "round" ) + 
  geom_point(mapping = aes(x,y), data = data.frame(x = data[,1],y =data[,2]), size = 0.5)  + theme_bw() +
  xlab(expression(~ phi)) + ylab(expression(~ psi)) + theme(axis.title.y = element_text(angle=0, vjust = 0.5)) +
  scale_x_continuous(breaks = c(-2,-1,0,1,2)*pi/2, limits = c(-pi, pi),
                     labels = c(expression(- ~pi),expression(- ~pi/2),"0",expression(~pi/2),expression(~pi))) + 
  scale_y_continuous(breaks = c(-2,-1,0,1,2)*pi/2, limits = c(-pi, pi),
                     labels = c(expression(- ~pi),expression(- ~pi/2),"0",expression(~pi/2),expression(~pi))) +
  ggtitle("KDE-based Prediction sets")
g_cp

2. Inductive conformal prediction by a mixture model

Choice of alpha and J. For each candidate number of mixture components J = 3, ..., 35, fit the inductive conformal predictor; for each level alpha, let mu(alpha, J) be the fraction of the torus covered by the resulting prediction set. The pair (alpha, J) minimizing alpha + mu(alpha, J) is selected below.

Jvec <- 3:35   # candidate numbers of mixture components
l <- list()
set.seed(0)
n <- nrow(data)
split.id <- NULL   # NULL: let icp.torus.score use its default data split
for (j in Jvec){
  l[[j]] <- icp.torus.score(data, split.id = split.id,
                            method = "mixture",
                            mixturefitmethod = "a",
                            param = list(J = j))
}
## EMsinvMmix: fitting vM2 with option axis-aligned, J = 3, ..., 35
## (per-J iteration progress counters omitted)
n2 <- l[[10]]$n2                  # size of the calibration (second) split
alphavec <- 1:floor(n2/2) / n2    # candidate levels, multiples of 1/n2
N <- length(alphavec)

### Build a data frame of (alpha, J, mu, alpha + mu), where mu is the fraction of the grid covered by the level-alpha prediction set
out <- data.frame()
for (j in Jvec){
  Mvec <- alphavec
  a <- icp.torus.eval(l[[j]], level = alphavec, eval.point = grid.torus())
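  # a$Chat_e is an indicator matrix over the evaluation grid:
  # one row per grid point, one column per level in alphavec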
  for (i in 1:N){
    # mu(alpha_i, J): proportion of the 10,000 grid points inside the level-alpha set
    Mvec[i] <- sum(a$Chat_e[,i])/10000
  }
  out <- rbind(out, data.frame(alpha = alphavec, J = j, mu = Mvec, criterion = alphavec + Mvec))
}

out <- out %>% mutate(criterion2 = alpha*(3/2) + mu,       # alternative: weight alpha more heavily
                      criterion3 = sqrt(alpha^2 + mu^2))   # alternative: Euclidean combination

out.index <- which.min(out$criterion)
out[out.index,] 
##           alpha  J     mu criterion criterion2 criterion3
## 2207 0.07883817 12 0.1408 0.2196382  0.2590573  0.1613694
Jhat <- out[out.index,2]
alphahat <- out[out.index,1]

Mixture-based prediction sets with the chosen alpha and J.

icp.torus <- l[[Jhat]]

ia <- icp.torus.eval(icp.torus, level = alphahat, eval.point = grid.torus())
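# icp.torus.eval returns 0/1 indicators (Chat_mix, Chat_max, Chat_e) of the prediction set
# over the evaluation grid; only the C_e set is plotted below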
b <- data.frame(ia$phi,ia$psi, ia$Chat_mix == 1, ia$Chat_max == 1, ia$Chat_e == 1)
colnames(b) <- c("phi","psi","C_mix","C_max","C_e")
b <- b %>%  pivot_longer(5, names_to = "Type", values_to = "value")
g0 <- ggplot() + geom_contour(aes(phi, psi, z = ifelse(value,1,0), linetype = Type), data = b, size= 1, color = "darkgrey", lineend = "round" ) + 
  geom_point(mapping = aes(x,y), size = 0.5, data = data.frame(x = data[,1],y =data[,2])) + theme_bw() +
  xlab(expression(~ phi)) + ylab(expression(~ psi)) + theme(axis.title.y = element_text(angle=0, vjust = 0.5)) +
  scale_x_continuous(breaks = c(-2,-1,0,1,2)*pi/2, limits = c(-pi, pi),
                     labels = c(expression(- ~pi),expression(- ~pi/2),"0",expression(~pi/2),expression(~pi))) + 
  scale_y_continuous(breaks = c(-2,-1,0,1,2)*pi/2, limits = c(-pi, pi),
                     labels = c(expression(- ~pi),expression(- ~pi/2),"0",expression(~pi/2),expression(~pi))) +
  ggtitle("Mixture-based Prediction sets")
g0

3. Clustering

3.1 Clustering by existing methods

J <- 3   # number of clusters used by the existing methods below

### Clustering with existing methods
## Naive K-means on the raw angles (ignores the periodicity of the torus)
kmeans.out <- KMeans_rcpp(data, clusters = J)
g_kmeans <- data.frame(phi = data[,1], psi = data[,2], membership = as.factor(kmeans.out$clusters)) %>% 
  ggplot(aes(phi,psi, shape = membership, color = membership)) + geom_point()  + theme_bw() +
  xlab(expression(~ phi)) + ylab(expression(~ psi)) + theme(axis.title.y = element_text(angle=0, vjust = 0.5)) +
  scale_x_continuous(breaks = c(-2,-1,0,1,2)*pi/2, limits = c(-pi, pi),
                     labels = c(expression(- ~pi),expression(- ~pi/2),"0",expression(~pi/2),expression(~pi))) + 
  scale_y_continuous(breaks = c(-2,-1,0,1,2)*pi/2, limits = c(-pi, pi),
                     labels = c(expression(- ~pi),expression(- ~pi/2),"0",expression(~pi/2),expression(~pi))) +
  ggtitle("Naive K-means")
g_kmeans_grey <- g_kmeans + scale_color_grey()


## Distance-based clustering on the torus
# Pairwise L2-angular distance (respects wrap-around), used by PAM and hierarchical clustering below
pdist.data2 <- ang.pdist(data)
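# For reference, a minimal sketch of the pairwise L2-angular distance that ang.pdist()
# (defined in Source_files/routines.R) is assumed to compute: componentwise angular
# differences wrapped to [0, pi], combined by the Euclidean norm.
ang.pdist.sketch <- function(x) {
  n <- nrow(x)
  D <- matrix(0, n, n)
  for (i in seq_len(n - 1)) {
    for (j in (i + 1):n) {
      d <- abs(x[i, ] - x[j, ])      # componentwise angular gap
      d <- pmin(d, 2 * pi - d)       # wrap around the torus
      D[i, j] <- D[j, i] <- sqrt(sum(d^2))
    }
  }
  as.dist(D)
}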

## PAM (partitioning around medoids; Kaufman and Rousseeuw, 1990) on the angular distances
pam.out <- Cluster_Medoids(as.matrix(pdist.data2), clusters = J)
g_pam <- data.frame(phi = data[,1], psi = data[,2], membership = as.factor(pam.out$clusters)) %>% 
  ggplot(aes(phi,psi, shape = membership, color = membership)) + geom_point() + theme_bw() +
  xlab(expression(~ phi)) + ylab(expression(~ psi)) + theme(axis.title.y = element_text(angle=0, vjust = 0.5)) +
  scale_x_continuous(breaks = c(-2,-1,0,1,2)*pi/2, limits = c(-pi, pi),
                     labels = c(expression(- ~pi),expression(- ~pi/2),"0",expression(~pi/2),expression(~pi))) + 
  scale_y_continuous(breaks = c(-2,-1,0,1,2)*pi/2, limits = c(-pi, pi),
                     labels = c(expression(- ~pi),expression(- ~pi/2),"0",expression(~pi/2),expression(~pi))) +
  ggtitle("Partitioning around medoids")
g_pam_grey <- g_pam + scale_color_grey()


## K-means in the ambient space R^4 with kmeans++ initialization
kmeans.out <- KMeans_rcpp(cbind(cos(data), sin(data)), clusters = J)   # embed each angle as (cos, sin)
g_kmeans2 <- data.frame(phi = data[,1], psi = data[,2], membership = as.factor(kmeans.out$clusters)) %>% 
  ggplot(aes(phi,psi, shape = membership, color = membership)) + geom_point() + theme_bw() +
  xlab(expression(~ phi)) + ylab(expression(~ psi)) + theme(axis.title.y = element_text(angle=0, vjust = 0.5)) +
  scale_x_continuous(breaks = c(-2,-1,0,1,2)*pi/2, limits = c(-pi, pi),
                     labels = c(expression(- ~pi),expression(- ~pi/2),"0",expression(~pi/2),expression(~pi))) + 
  scale_y_continuous(breaks = c(-2,-1,0,1,2)*pi/2, limits = c(-pi, pi),
                     labels = c(expression(- ~pi),expression(- ~pi/2),"0",expression(~pi/2),expression(~pi))) +
  ggtitle("K-means with chordal distance (in the ambient space) ")
g_kmeans2_grey <- g_kmeans2 + scale_color_grey()


## Hierarchical clustering (complete linkage) on the angular distances
hc.complete <- hclust(pdist.data2, method = "complete")
membership <- cutree(hc.complete, J)
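# Optional check: inspect the complete-linkage tree before cutting it into J clusters
# (ggdendro is loaded above; this is an illustrative sketch)
ggdendrogram(hc.complete, labels = FALSE) + ggtitle("Complete-linkage dendrogram")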
g_hier <- data.frame(phi = data[,1], psi = data[,2], membership = as.factor(membership)) %>% 
  ggplot(aes(phi,psi, shape = membership, color = membership)) + geom_point() + theme_bw() +
  xlab(expression(~ phi)) + ylab(expression(~ psi)) + theme(axis.title.y = element_text(angle=0, vjust = 0.5)) +
  scale_x_continuous(breaks = c(-2,-1,0,1,2)*pi/2, limits = c(-pi, pi),
                     labels = c(expression(- ~pi),expression(- ~pi/2),"0",expression(~pi/2),expression(~pi))) + 
  scale_y_continuous(breaks = c(-2,-1,0,1,2)*pi/2, limits = c(-pi, pi),
                     labels = c(expression(- ~pi),expression(- ~pi/2),"0",expression(~pi/2),expression(~pi))) +
  ggtitle("Hierachical clustering with average L2-Angular distance") 
g_hier_grey <- g_hier + scale_color_grey()

plot_grid(g_kmeans, g_pam, g_kmeans2 + ggtitle("Extrinsic K-means"),
          g_hier + ggtitle("Hierarchical clustering"), label_size = 12)

3.2 Clustering by conformal prediction

c <- cluster.assign.torus(data, icp.torus, level = alphahat) 
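# The assignment object (used below) provides, among others:
#   cluster.id.by.ehat     - cluster labels from the conformity-score (e-hat) assignment
#   cluster.id.outlier     - labels in which points outside the prediction set get the largest label
#   cluster.id.by.Mah.dist - labels assigned by Mahalanobis-type distance to the mixture components
#   ncluster               - number of clusters identified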
g_e <- data.frame(phi = data[,1], psi = data[,2], membership = as.factor(c$cluster.id.by.ehat)) %>% 
  ggplot(aes(phi,psi, shape = membership, color = membership)) + geom_point()  + theme_bw() +
  xlab(expression(~ phi)) + ylab(expression(~ psi)) + theme(axis.title.y = element_text(angle=0, vjust = 0.5)) +
  scale_x_continuous(breaks = c(-2,-1,0,1,2)*pi/2, limits = c(-pi, pi),
                     labels = c(expression(- ~pi),expression(- ~pi/2),"0",expression(~pi/2),expression(~pi))) + 
  scale_y_continuous(breaks = c(-2,-1,0,1,2)*pi/2, limits = c(-pi, pi),
                     labels = c(expression(- ~pi),expression(- ~pi/2),"0",expression(~pi/2),expression(~pi))) +
  ggtitle(paste("Clusters, K=", c$ncluster))
g_e

membership <- c$cluster.id.outlier
# the largest label marks points outside the prediction set; relabel those points "out"
membership <- ifelse(membership == max(unique(c$cluster.id.outlier)), "out", membership)
g1 <- data.frame(phi = data[,1], psi = data[,2], membership = as.factor(membership)) %>%  
  ggplot(aes(phi,psi, shape = membership, color = membership)) + geom_point()  + theme_bw() +
  xlab(expression(~ phi)) + ylab(expression(~ psi)) + theme(axis.title.y = element_text(angle=0, vjust = 0.5)) +
  scale_x_continuous(breaks = c(-2,-1,0,1,2)*pi/2, limits = c(-pi, pi),
                     labels = c(expression(- ~pi),expression(- ~pi/2),"0",expression(~pi/2),expression(~pi))) + 
  scale_y_continuous(breaks = c(-2,-1,0,1,2)*pi/2, limits = c(-pi, pi),
                     labels = c(expression(- ~pi),expression(- ~pi/2),"0",expression(~pi/2),expression(~pi))) +
  ggtitle(paste("Clusters and outliers, K=",length(unique(c$cluster.id.outlier))))
g1

g_mah <- data.frame(phi = data[,1], psi = data[,2], membership = as.factor(c$cluster.id.by.Mah.dist)) %>% 
  ggplot(aes(phi,psi, shape = membership, color = membership)) + geom_point() + theme_bw() +
  xlab(expression(~ phi)) + ylab(expression(~ psi)) + theme(axis.title.y = element_text(angle=0, vjust = 0.5)) +
  scale_x_continuous(breaks = c(-2,-1,0,1,2)*pi/2, limits = c(-pi, pi),
                     labels = c(expression(- ~pi),expression(- ~pi/2),"0",expression(~pi/2),expression(~pi))) + 
  scale_y_continuous(breaks = c(-2,-1,0,1,2)*pi/2, limits = c(-pi, pi),
                     labels = c(expression(- ~pi),expression(- ~pi/2),"0",expression(~pi/2),expression(~pi))) +
  ggtitle(paste("Clustering by Mahalanobis distance, K=",c$ncluster))
g_mah

3.3 Overlay with fitted ellipses

g2 <- g_e + ggtitle(paste("SADS-CoV, K=", c$ncluster))

level <- alphahat
n2 <- icp.torus$n2
ialpha <- floor((n2 + 1) * level)
t <- icp.torus$mixture$score_ellipse[ialpha]   # threshold used below to decide which ellipses are drawn

# Draw.ellipses.bvn.approx(data, icp.torus$mixture$fit$parammat, t, data, c$cluster.id.outlier)
ellipse.param <- icp.torus$mixture$ellipsefit
J <- length(ellipse.param$mu1)
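# Each mixture component j contributes the ellipse
#   { x : (x - mu_j)' Sigmainv_j (x - mu_j) <= c_j - t }
# to the level-alpha prediction set (empty when c_j - t < 0). Its boundary is traced as
# mu_j + A %*% c(cos(theta), sin(theta)) with A = (Sigmainv_j / (c_j - t))^(-1/2),
# computed below via an eigendecomposition.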

# boundary of each ellipse: the unit circle mapped through the component's shape matrix
theta <- seq(-pi, pi, length.out = 999)
Z <- cbind(cos(theta), sin(theta))

# nine shifted copies of each ellipse (shifts by 0, +/- 2*pi in each coordinate),
# so that ellipses crossing the boundary of [-pi, pi)^2 wrap around correctly
shift <- matrix(0, ncol = 2, nrow = 9)
shift[,1] <- c(0, 2*pi, -2*pi)                  # recycled to length 9
shift[,2] <- rep(c(0, 2*pi, -2*pi), each = 3)


for (j in 1:J) {
  mu <- c(ellipse.param$mu1[j], ellipse.param$mu2[j])
  Sinv <- ellipse.param$Sigmainv[[j]]
  c.minus.t <- ellipse.param$c[j] - t

  # components with c_j - t < 0 contribute an empty set and are skipped
  if (c.minus.t < 0) {
    cat("skip", j, ",")
    next
  }
  cat("draw", j, ",")
  M <- eigen(Sinv / c.minus.t)
  Mmhalf <- M$vectors %*% diag(sqrt(1 / M$values)) %*% t(M$vectors)   # (Sinv / (c_j - t))^(-1/2)
  R <- Mmhalf %*% t(Z)
  for (shift.id in 1:9) {
    RR <- R + mu + shift[shift.id, ]
    g2 <- g2 + geom_polygon(aes(x = phi, y = psi), color = "black", alpha = 0.1,
                            data = data.frame(phi = RR[1, ], psi = RR[2, ]))
  }
}
## draw 1 ,draw 2 ,draw 3 ,draw 4 ,draw 5 ,draw 6 ,draw 7 ,draw 8 ,skip 9 ,skip 10 ,skip 11 ,skip 12 ,
g2 <- g2 + coord_cartesian(xlim = c(-pi, pi), ylim = c(-pi,pi))
g2