##########INSTRUCTION###############################
#You only need to specify your data folder and csv dataset name in the "INPUTS" section below
#The output dataset will be produced in the folder you specify
#Comment: This program calculates scale scores for each survey taker. The calculated scale scores are just 
#         additional analysis variables attached to your input dataset. If your survey has a complex sample
#         design (e.g., you took a sample from your target population with stratification, clustering, etc.),
#         you should apply your design specifications (strata, cluster, weights, etc.) as you would do when 
#         analyzing other survey variables.
####################################################

##########INPUTS###############################
#Specify your data folder and csv dataset name
folder <- "FOLDER PATH"
dataname <- "student data.csv"

##########SCALING PROGROM######################
#Option for decimal places
options(digits=10)

#Rounding function that rounds .5 up to the next integer
round2 = function(x, n) {
  posneg = sign(x)
  z = abs(x)*10^n
  z = z + 0.5
  z = trunc(z)
  z = z/10^n
  z*posneg
}

#Scale score calculation function
rasch <- function(scalelist=NULL){

	#Stepvalues that are estimated from the national benchmarking. These are the values in Table A4 of the national benchmarking documentation
  #(https://safesupportivelearning.ed.gov/sites/default/files/EDSCLS_Psychometric_Benchmarking_Technical_Report.pdf).
  #These three sets of values are used to calculate the scale scores for any survey takers. 
	c0 <- c("SENGCLC1","SENGCLC2","SENGCLC3","SENGCLC4","SENGCLC7","SENGREL9","SENGREL11","SENGREL12","SENGREL14","SENGREL153","SENGREL17","SENGREL20","SENGREL21","SENGREL29",
			"SENGPAR44","SENGPAR45","SENGPAR46","SENGPAR47","SENGPAR48","SSAFEMO49","SSAFEMO52","SSAFEMO53","SSAFEMO54","SSAFEMO56","SSAFEMO57",
			"SSAFPSAF60","SSAFPSAF63","SSAFPSAF65","SSAFPSAF67","SSAFPSAF68","SSAFPSAF69","SSAFPSAF71","SSAFBUL74","SSAFBUL75","SSAFBUL76","SSAFBUL77B","SSAFBUL73","SSAFBUL83",
			"SSAFSUB88","SSAFSUB91","SSAFSUB92","SSAFSUB93","SSAFSUB94","SENVPENV100","SENVPENV102","SENVPENV105","SENVPENV106","SENVPENV107",
			"SENVINS111","SENVINS113","SENVINS114","SENVINS115","SENVINS121","SENVMEN130","SENVMEN132","SENVMEN133","SENVMEN134","SENVMEN137",
			"SENVDIS142","SENVDIS143","SENVDIS146","SENVDIS147","SENVDIS147C")
	c1 <- c(-0.88758,-1.21688,-1.70151,-1.13062,-1.59396,-0.9249,-1.6756,-1.37546,-1.19418,-1.35606,-1.32961,-0.77372,-1.21581,-1.45891,-1.02078,-1.15759,-0.83518,-1.57761,-1.54687,-1.19732,-0.79951,-1.15283,-0.652,-1.1754,-1.21302,-1.40832,-1.53905,-1.71611,-0.94029,-0.58,-1.03096,-0.72259,-0.98061,-1.18313,-0.94225,-1.00375,-0.83535,-0.47988,-1.2565,-1.28382,-1.4722,-1.09243,-1.04787,-0.22565,-0.83049,-1.56286,-1.58736,-1.27968,-1.47932,-1.53159,-1.49103,-1.57155,-2.2499,-1.45449,-1.49858,-1.42755,-0.20715,-0.59274,-1.81967,-1.40034,-1.56048,-1.03239,-1.03612)
	c2 <- c(-0.21689,-0.27319,-1.0471,-0.11206,-1.06613,0.07381,-0.63246,-0.30377,-1.18031,-1.27296,-0.53273,0.42434,-0.11198,-1.67465,-0.38939,-0.3564,0.19951,-1.54482,-1.21379,-0.08038,0.6924,0.25698,-0.98285,-0.86814,-1.09519,-1.56709,-1.81347,-0.96873,0.35623,0.38108,0.19308,0.15162,-0.46344,-0.71079,-0.13648,-0.29701,0.09477,0.53213,-0.55163,-0.41416,-0.71855,-0.02012,0.09659,0.46013,0.48755,-0.9234,-0.77854,0.15582,-0.62922,-0.99408,-0.29674,-1.21026,-2.46508,-1.00195,-1.06141,-0.88773,1.33389,0.61958,-1.84193,-0.55586,-0.52414,-0.47264,-0.48429)
	c3 <- c(1.51974,1.64389,1.90232,1.70962,1.54944,2.41102,2.09221,2.13021,1.69008,1.58491,2.16515,2.91955,2.90247,1.27887,1.40529,1.2542,2.39614,0.97705,1.67163,3.05184,2.64666,2.86166,1.31697,1.59058,1.57176,1.39777,0.00153,0.16629,1.60171,1.49101,1.89466,2.2059,1.23622,1.13569,1.18006,1.47382,2.16486,2.06854,0.49417,0.54856,0.34286,0.61828,0.57692,2.66983,2.48407,2.10358,2.00146,2.34459,1.70855,1.92133,2.11189,1.09009,0.32927,1.46992,1.52817,1.52252,2.7488,2.81182,1.31956,1.72246,1.92085,1.27339,1.82711)
	Rstepvalues <- as.data.frame(rbind(c1,c2,c3))
	names(Rstepvalues) <- c0	

	#Items that are negatively valenced. They are reverse-coded below.
	negative <-  c("SSAFPSAF63", "SSAFPSAF65", "SSAFPSAF67", "SSAFPSAF68", "SSAFPSAF69", "SSAFPSAF71", "SSAFBUL74", "SSAFBUL75", "SSAFBUL76", "SSAFBUL77B",
	"SSAFBUL73", "SSAFBUL83","SSAFSUB88","SSAFSUB91","SSAFSUB92","SSAFSUB93","SSAFSUB94")

	#Loop through the scale list
	for (scalename in scalelist) {
	
	  #Stepvalues for the items of the current scale
	  stepvalues <- Rstepvalues[eval(as.name(scalename))]
	  items <- names(stepvalues)
	  
	  #Only use the variables that belong to scales
	  tempdata <- indata[c0]
	  
	  #Code negative values to missing
	  tempdata[tempdata < 0] <- NA
	  
	  #Reverse code negatively valenced items
	  tempdata[negative] <- 5-tempdata[negative]
	  
	  #Use only the variables of the current scale
	  tempdata <- tempdata[items]
	  
	  #Recode values from the range of 1-4 to 0-3
	  tempdata <- tempdata[]-1
	  
	  #The algorithm below is based on a Rasch partial credit model. 
	  #See section 3.1 of the national benchmarking documentation referenced above for more details.
	  #Further reading: http://www.winsteps.com/a/Linacre-estimation-further-topics.pdf 
		tdata <- tempdata
		tdata$counts = apply(tdata,1,function(x) sum(!is.na(x[])))
		titems <- names(stepvalues)
		tstepvalues <- stepvalues
		tv1 <- paste("tv1", titems, sep="")
		tv2 <- paste("tv2", titems, sep="")
		tv3 <- paste("tv3", titems, sep="")
		tdata[tv1] <- stepvalues[1,]
		tdata[tv2] <- stepvalues[2,]
		tdata[tv3] <- stepvalues[3,]
		nm <- paste("nm", titems, sep="")
		tdata[nm] <- tdata[titems]*0+1
		tdata$anm <- rowSums(tdata[nm],na.rm=TRUE)
		tdata[tv1] <- tdata[tv1]*tdata[nm]
		tdata[tv2] <- tdata[tv2]*tdata[nm]
		tdata[tv3] <- tdata[tv3]*tdata[nm]
		tdata$rl <- tdata$anm*0
		tdata$ru <- tdata$anm*3
		tdata$ru[tdata$ru==0] <- 1
		tdata$r <- rowSums(tdata[items],na.rm=TRUE)
		tdata$r[tdata$r==tdata$rl] <- tdata$rl[tdata$r==tdata$rl]+0.3
		tdata$r[tdata$r==tdata$ru] <- tdata$ru[tdata$r==tdata$ru]-0.3
		d <- mean(colMeans(tstepvalues[],na.rm=TRUE))
		tdata$t0 <- d + log((tdata$r-tdata$rl)/(tdata$ru-tdata$r))
		tdata$t <- 0
		tdata$t1 <- 1
		iter <- 1
		while(max(abs(tdata$t1-tdata$t),na.rm=TRUE)>0.0000001 & iter <=100) {
			if (iter==1) {tdata$t <- tdata$t0
				} else if (iter>1) {tdata$t <- tdata$t1}
			iter <- iter+1
			for (n in 1:ncol(tstepvalues)) { 
				jp <- paste("jp", titems[n], sep="")
				j2p <- paste("j2p", titems[n], sep="")
				dj <- paste("dj", titems[n], sep="")
				tdata$p1 <- exp(1*tdata$t-tdata[,tv1[n]])/
					(1+exp(1*tdata$t-tdata[,tv1[n]])+exp(2*tdata$t-tdata[,tv1[n]]-tdata[,tv2[n]])+
					exp(3*tdata$t-tdata[,tv1[n]]-tdata[,tv2[n]]-tdata[,tv3[n]]))
				tdata$p2 <- exp(2*tdata$t-tdata[,tv1[n]]-tdata[,tv2[n]])/
					(1+exp(1*tdata$t-tdata[,tv1[n]])+exp(2*tdata$t-tdata[,tv1[n]]-tdata[,tv2[n]])+
					exp(3*tdata$t-tdata[,tv1[n]]-tdata[,tv2[n]]-tdata[,tv3[n]]))
				tdata$p3 <- exp(3*tdata$t-tdata[,tv1[n]]-tdata[,tv2[n]]-tdata[,tv3[n]])/
					(1+exp(1*tdata$t-tdata[,tv1[n]])+exp(2*tdata$t-tdata[,tv1[n]]-tdata[,tv2[n]])+
					exp(3*tdata$t-tdata[,tv1[n]]-tdata[,tv2[n]]-tdata[,tv3[n]]))
				tdata$p1[is.na(tdata[,tv1[n]])] <- NA
				tdata$p2[is.na(tdata[,tv1[n]])] <- NA
				tdata$p3[is.na(tdata[,tv1[n]])] <- NA
				tdata[jp] <- 1*tdata$p1+2*tdata$p2+3*tdata$p3
				tdata[j2p] <- 1*1*tdata$p1+2*2*tdata$p2+3*3*tdata$p3
				tdata[dj] <- tdata[j2p]-tdata[jp]*tdata[jp]
			}
			tdata$e <- rowSums(tdata[, grep("jp", names(tdata))],na.rm = TRUE)
			tdata$v <- rowSums(tdata[, grep("dj", names(tdata))],na.rm = TRUE)
			tdata$t1 <- tdata$t+(tdata$r-tdata$e)/pmax(2*tdata$v,1,na.rm = TRUE)
		}
		
	#Set to missing if the algorithm did not converge
	tdata$t1[tdata$t1>30] <- NA
	
	#Only calculate score scores for those valid responses to at least three items and more than half of the items 
	tdata$t1[tdata$counts<3 | tdata$counts<0.5*ncol(tempdata)] <- NA
	
	##########SCALING FACTORS######################
	#These factors are used to recale the scale scores so that the cut scores are 300 and 400 for all scales.
	#This is a linear transformation so it does not affect the relative distance between two scores: The distance is inflated by a constant factor B.
	#See section 3.3 the national benchmarking documentation referenced above for more details.
	if (scalename == 'eng' ) {
	  A <- 325.859
	  B <- 40.557
	}
	if (scalename == 'clc' ) {
	  A <- 325.697
	  B <- 44.614
	}
	if (scalename == 'rel' ) {
	  A <- 322.89
	  B <- 36.517
	}
	if (scalename == 'par' ) {
	  A <- 332.549
	  B <- 44.683
	}
	if (scalename == 'saf' ) {
	  A <- 320.161
	  B <- 60.027
	}
	if (scalename == 'emo' ) {
	  A <- 315.079
	  B <- 39.663
	}
	if (scalename == 'psaf' ) {
	  A <- 328.929
	  B <- 59.402
	}
	if (scalename == 'bul' ) {
	  A <- 310.828
	  B <- 58.872
	}
	if (scalename == 'sub' ) {
	  A <- 338.85
	  B <- 119.933
	}
	if (scalename == 'env' ) {
	  A <- 326.967
	  B <- 40.341
	}
	if (scalename == 'penv' ) {
	  A <- 307.241
	  B <- 40.106
	}
	if (scalename == 'ins' ) {
	  A <- 344.491
	  B <- 38.992
	}
	if (scalename == 'men' ) {
	  A <- 318.995
	  B <- 41.928
	}
	if (scalename == 'dis' ) {
	  A <- 333.135
	  B <- 41.56
	}
	
	#Rescale the scores based on the factors above
	tdata$tscore <- round2(tdata$t1*B+A,0)
	
	#Keep only the calculated scale scores
	score_data <- tdata[c("tscore")]
	
	#Rename the data using the scale name and save it to the workspace  
	colnames(score_data) <- paste(scalename, colnames(score_data), sep = "_")
	assign(scalename,score_data,envir = .GlobalEnv)
	}
}

##########SPECIFICATION AND OUTPUT######################
#Read in data
indata <- read.table(file=sprintf("%s\\%s", folder, dataname),header=TRUE,sep=",")
names(indata) <- toupper(names(indata))

#Item names for each scale
eng <- c("SENGCLC1","SENGCLC2","SENGCLC3","SENGCLC4","SENGCLC7","SENGREL9","SENGREL11","SENGREL12","SENGREL14","SENGREL153","SENGREL17","SENGREL20","SENGREL21","SENGREL29",
	"SENGPAR44","SENGPAR45","SENGPAR46","SENGPAR47","SENGPAR48")
clc <- c("SENGCLC1", "SENGCLC2", "SENGCLC3", "SENGCLC4", "SENGCLC7")
rel <- c("SENGREL9","SENGREL11","SENGREL12","SENGREL14","SENGREL153","SENGREL17","SENGREL20","SENGREL21","SENGREL29")
par <- c("SENGPAR44","SENGPAR45","SENGPAR46","SENGPAR47","SENGPAR48")
saf <- c("SSAFEMO49","SSAFEMO52","SSAFEMO53","SSAFEMO54","SSAFEMO56","SSAFEMO57","SSAFPSAF60","SSAFPSAF63","SSAFPSAF65","SSAFPSAF67","SSAFPSAF68","SSAFPSAF69","SSAFPSAF71",
	"SSAFBUL74","SSAFBUL75","SSAFBUL76","SSAFBUL77B","SSAFBUL73","SSAFBUL83","SSAFSUB88","SSAFSUB91","SSAFSUB92","SSAFSUB93","SSAFSUB94")
emo <- c("SSAFEMO49", "SSAFEMO52", "SSAFEMO53", "SSAFEMO54", "SSAFEMO56", "SSAFEMO57")
psaf <- c("SSAFPSAF60", "SSAFPSAF63", "SSAFPSAF65", "SSAFPSAF67", "SSAFPSAF68", "SSAFPSAF69", "SSAFPSAF71")
bul <- c("SSAFBUL74","SSAFBUL75","SSAFBUL76","SSAFBUL77B","SSAFBUL73","SSAFBUL83")
sub <- c("SSAFSUB88","SSAFSUB91","SSAFSUB92","SSAFSUB93","SSAFSUB94")
env <- c("SENVPENV100","SENVPENV102","SENVPENV105","SENVPENV106","SENVPENV107","SENVINS111","SENVINS113","SENVINS114","SENVINS115","SENVINS121","SENVMEN130","SENVMEN132",
	"SENVMEN133","SENVMEN134","SENVMEN137","SENVDIS142","SENVDIS143","SENVDIS146","SENVDIS147","SENVDIS147C")
penv <- c("SENVPENV100","SENVPENV102","SENVPENV105","SENVPENV106","SENVPENV107")
ins <- c("SENVINS111","SENVINS113","SENVINS114","SENVINS115","SENVINS121")
men <- c("SENVMEN130", "SENVMEN132", "SENVMEN133", "SENVMEN134", "SENVMEN137")
dis <- c("SENVDIS142","SENVDIS143","SENVDIS146","SENVDIS147","SENVDIS147C")

#Run scaling program for each scale
rasch(scalelist=c("eng","clc","rel","par","saf","emo","psaf","bul","sub","env","penv","ins","men","dis"))

#Attach the scale score data to the input data
out <- cbind(indata,eng,clc,rel,par,saf,emo,psaf,bul,sub,env,penv,ins,men,dis)

#Exclude cases if they did not provide any demographic information
for (v in c("eng","clc","rel","par","saf","emo","psaf","bul","sub","env","penv","ins","phea","men","dis")) {
  out[which((is.na(out$SDEMO148) | out$SDEMO148 == "" | out$SDEMO148 == -1) & (is.na(out$SDEMO149) | out$SDEMO149 == "" | out$SDEMO149 == -1) & (is.na(out$SDEMO150) | out$SDEMO150 == "" | out$SDEMO150 == -1)  & (is.na(out$SDEMO151) | out$SDEMO151 == "" | out$SDEMO151 == -1)),paste(v,"_tscore",sep="")] <- NA
}

##########OUTPUT DATASET WITH SCALE SCORES###############
write.table(out, sprintf("%s\\scale_score_%s", folder, dataname), col.names=T,row.names=F,sep=",")

##########END OF PROGRAM#################################