# Filename: deduplicate.r
#
# Read id codes and associated non-secure attributes.
#
# Layout of "qanalysis.dat" as consumed by this section:
#   line 1            : the number of records (nmrecs)
#   line 2            : skipped (presumably a section header -- TODO confirm)
#   next nmrecs lines : one record per line, blank-separated fields:
#                       id code, two numeric attributes, two character
#                       attributes
con <- file(description = "qanalysis.dat", open = "r")
nmrecs <- scan(file = con, nlines = 1, quiet = TRUE)
if (length(nmrecs) != 1L || is.na(nmrecs) || nmrecs < 0) {
   close(con)
   stop("qanalysis.dat: line 1 must hold a single non-negative record count",
        call. = FALSE)
}
# Placeholder values; they are overwritten as the records are read.
idcode <- sprintf("none%d", seq_len(nmrecs))
nattbts <- matrix(0, nmrecs, 2)
# Only columns 1 and 2 of cattbts are filled below; columns 3 and 4 keep the
# blank placeholder.
cattbts <- matrix("     ", nmrecs, 4)
# Three similarity measures per record pair, filled in by the section that
# reads the similarity values.
simval <- array(0, c(nmrecs, nmrecs, 3))
# Skip the line that precedes the records.
tmp <- readLines(con, n = 1)
for (i in seq_len(nmrecs)) {
   tmpraw <- readLines(con, n = 1)
   if (length(tmpraw) == 0L) {
      close(con)
      stop("qanalysis.dat: unexpected end of file at record ", i,
           call. = FALSE)
   }
   # readLines() returns the whole line as ONE string, so it must be split
   # into fields before indexing; without the split, fields 2..5 are always NA.
   # The delimiter matches the one used for the similarity lines.
   # NOTE(review): assumes record lines do not start with a blank (a leading
   # blank would make field 1 an empty string) -- confirm against the data.
   tmp <- unlist(strsplit(tmpraw, " +"))
   idcode[i] <- tmp[1]
   nattbts[i, 1] <- as.numeric(tmp[2])
   nattbts[i, 2] <- as.numeric(tmp[3])
   cattbts[i, 1] <- tmp[4]
   cattbts[i, 2] <- tmp[5]
}
# Read player-pair similarity measure values.
#
# One line is skipped first (presumably a section header -- TODO confirm).
# Then one line is read for every pair (i, j) with i < j, with i as the outer
# and j as the inner loop index. Each line is split on runs of blanks; fields
# 3 to 5 are stored as the three similarity measures, fields 1 and 2 are not
# used here.
tmp <- readLines(con, n = 1)
# seq_len() keeps the loop empty when there are fewer than two records;
# 1:(nmrecs - 1) would count down (1, 0) and index simval out of bounds.
for (i in seq_len(max(nmrecs - 1, 0))) {
   for (j in (i + 1):nmrecs) {
      tmpraw <- readLines(con, n = 1)
      if (length(tmpraw) == 0L) {
         # Release the connection before reporting the truncated file.
         close(con)
         stop("qanalysis.dat: unexpected end of file at pair (", i, ", ", j,
              ")", call. = FALSE)
      }
      tmp <- unlist(strsplit(tmpraw, " +"))
      # Missing fields become NA, exactly as element-wise indexing would give.
      simval[i, j, 1:3] <- as.numeric(tmp[3:5])
   }
}
close(con)
# Set the threshold value for declaring a duplicate record.
thrshld <- .9
# Compute the total similarity score and print out those player-pairs that
# are judged to be duplicates.
#
# The score of a pair is the plain average of its nmsims similarity measures.
# Every pair's score is printed; pairs whose score is strictly greater than
# thrshld are additionally reported as duplicates.
nmsims <- 3
# seq_len() keeps the loop empty when there are fewer than two records;
# 1:(nmrecs - 1) would count down (1, 0) and index simval out of bounds.
for (i in seq_len(max(nmrecs - 1, 0))) {
   for (j in (i + 1):nmrecs) {
      # Accumulate sequentially so the floating-point result is unchanged.
      ts <- 0
      for (k in seq_len(nmsims)) {
         ts <- ts + simval[i, j, k]
      }
      ts <- ts / nmsims
      cat("i= ",i," j= ",j," ts= ",ts,"\n")
      if (ts > thrshld) {
         cat(idcode[i]," and ",idcode[j]," are judged to be duplicates.\n")
      }
   }
}
