##########INSTRUCTION###############################
#You only need to specify your data folder and csv dataset name in the "INPUTS" section below
#The output dataset will be produced in the folder you specify
#Comment: This program calculates scale scores for each survey taker. The calculated scale scores are just 
#         additional analysis variables attached to your input dataset. If your survey has a complex sample
#         design (e.g., you took a sample from your target population with stratification, clustering, etc.),
#         you should apply your design specifications (strata, cluster, weights, etc.) as you would do when 
#         analyzing other survey variables.
####################################################

##########INPUTS###############################
#Specify your data folder and csv dataset name
folder <- "FOLDER PATH"
dataname <- "noninstructional staff data.csv"

##########SCALING PROGROM######################
#Option for decimal places
options(digits=10)

#Rounding function that rounds .5 up to the next integer
round2 = function(x, n) {
  posneg = sign(x)
  z = abs(x)*10^n
  z = z + 0.5
  z = trunc(z)
  z = z/10^n
  z*posneg
}

#Scale score calculation function
rasch <- function(scalelist=NULL){

  #Stepvalues that are estimated from the national benchmarking. These are the values in Table A6 of the national benchmarking documentation
  #(https://safesupportivelearning.ed.gov/sites/default/files/EDSCLS_Psychometric_Benchmarking_Technical_Report.pdf).
  #These three sets of values are used to calculate the scale scores for any survey takers. 
  c0 <- c("NENGCLC2","NENGCLC3","NENGCLC4","NENGCLC6","NENGCLC7","NENGCLC8","NENGREL16","NENGREL17","NENGREL18","NENGREL24","NENGREL25","NENGREL30",
	        "NENGPAR34","NENGPAR37","NENGPAR38","NENGPAR44","NENGPAR47","NSAFEMO51","NSAFEMO52","NSAFEMO53","NSAFEMO54","NSAFEMO55","NSAFEMO148",
	        "NSAFPSAF57","NSAFPSAF58","NSAFPSAF59","NSAFPSAF61","NSAFPSAF63","NSAFPSAF64","NSAFBUL65","NSAFBUL66","NSAFBUL70","NSAFBUL76","NSAFBUL77",
	        "NSAFBUL78","NSAFBUL79","NSAFSUB83","NSAFSUB84","NSAFSUB85","NSAFSUB87","NSAFSUB88","NENVPENV97","NENVPENV98","NENVPENV99","NENVPENV100",
	        "NENVPENV102","NENVPENV103","NENVINS109","NENVINS110","NENVINS111","NENVINS140","NENVPHEA115","NENVPHEA117","NENVPHEA118",
	        "NENVPHEA119","NENVMEN122","NENVMEN125","NENVMEN126","NENVMEN127","NENVDIS130","NENVDIS131","NENVDIS132","NENVDIS134","NENVDIS134C",
	        "NENVDIS135","NENVDIS136","NENVDIS137")
	c1 <- c(-2.41269,-2.90456,-2.54538,-2.44946,-2.37193,-2.71602,-2.51626,-3.51999,-2.47435,-3.24542,-2.48313,-3.09321,-1.81246,-2.66216,-2.32825,-3.02065,-3.21406,-1.0321,-0.96298,-0.91493,-0.99467,-1.74572,-2.7771,-0.90158,-2.04704,-1.87806,-3.11085,-1.92372,-0.52093,-1.08067,-0.96739,-1.99696,-1.42134,-1.39773,-1.99819,-0.85593,-1.4652,-1.7502,-1.95507,-1.37448,-1.48793,-1.31365,-2.04206,-1.80633,-1.83406,-1.66769,-2.25032,-2.80502,-3.03108,-1.45675,-3.53932,-2.20742,-2.31603,-2.86615,-2.70978,-1.49341,-2.46824,-1.7135,-2.22951,-2.26218,-2.98917,-2.50782,-0.79484,-0.72395,-3.00002,-0.93667,-1.65967)
	c2 <- c(-0.62783,-0.93949,-0.35921,-1.91379,-0.49578,-0.90214,-1.33325,-0.57221,-0.64719,-1.08817,-0.03373,-0.94521,0.2645,0.20785,0.96649,-0.16654,-1.66755,-1.26082,-0.37968,-0.43057,-0.8671,-1.55043,-0.80514,0.26985,-0.5574,-0.67698,-1.76912,-1.89623,0.55635,0.66543,1.10514,-0.93873,-2.01817,-2.26507,-2.08219,-2.28479,-0.30073,-0.21784,-0.61555,-0.59265,-0.14093,-0.75334,-1.53466,-1.47464,-0.83077,-1.64181,-1.1453,-1.10456,-0.14643,-0.85192,-1.21007,-0.51886,-0.76917,-1.32258,-0.54017,-0.79424,0.31508,-0.05805,-0.99588,-1.08258,-2.15103,-1.84725,0.24373,-0.00727,-0.59234,-0.10022,-1.39865)
	c3 <- c(2.01573,2.21546,3.94359,2.20042,3.0729,3.1253,3.44854,3.47261,3.50871,2.71541,3.31846,3.72407,3.79969,3.80338,5.37724,4.01239,2.26929,1.70307,2.01959,1.7704,1.72993,2.12222,2.22456,3.05953,2.66121,2.70934,1.21185,1.02633,2.64525,4.41386,3.90358,2.38629,1.17343,1.03809,0.92795,0.98601,2.99407,3.24186,3.06233,3.20019,3.40319,2.46686,2.51556,2.54883,2.84167,2.19677,2.43437,2.80337,3.53325,2.6866,2.43721,3.27185,2.94548,3.29168,3.54043,3.55629,4.24422,4.06839,2.78672,2.67435,2.0923,2.35542,2.72603,3.01571,3.61033,3.33175,2.96215)
	Rstepvalues <- as.data.frame(rbind(c1,c2,c3))
	names(Rstepvalues) <- c0	

	#Items that are negatively valenced. They are reverse-coded below.
	negative <-  c("NSAFPSAF57","NSAFPSAF58","NSAFPSAF59","NSAFPSAF61","NSAFPSAF63","NSAFPSAF64","NSAFBUL65","NSAFBUL66","NSAFBUL76","NSAFBUL77",
	               "NSAFBUL78","NSAFBUL79","NENVPENV97","NENVPENV98","NENVPENV99","NENVPENV100")

	#Loop through the scale list
	for (scalename in scalelist) {
	
	  #Stepvalues for the items of the current scale
	  stepvalues <- Rstepvalues[eval(as.name(scalename))]
	  items <- names(stepvalues)
	  
	  #Only use the variables that belong to scales
	  tempdata <- indata[c0]
	  
	  #Code negative values to missing
	  tempdata[tempdata < 0] <- NA
	  
	  #Reverse code negatively valenced items
	  tempdata[negative] <- 5-tempdata[negative]
	  
	  #Use only the variables of the current scale
	  tempdata <- tempdata[items]
	  
	  #Recode values from the range of 1-4 to 0-3
	  tempdata <- tempdata[]-1
	  
	  #The algorithm below is based on a Rasch partial credit model. 
	  #See section 3.1 of the national benchmarking documentation referenced above for more details.
	  #Further reading: http://www.winsteps.com/a/Linacre-estimation-further-topics.pdf 
	  tdata <- tempdata
		tdata$counts = apply(tdata,1,function(x) sum(!is.na(x[])))
		titems <- names(stepvalues)
		tstepvalues <- stepvalues
		tv1 <- paste("tv1", titems, sep="")
		tv2 <- paste("tv2", titems, sep="")
		tv3 <- paste("tv3", titems, sep="")
		tdata[tv1] <- stepvalues[1,]
		tdata[tv2] <- stepvalues[2,]
		tdata[tv3] <- stepvalues[3,]
		nm <- paste("nm", titems, sep="")
		tdata[nm] <- tdata[titems]*0+1
		tdata$anm <- rowSums(tdata[nm],na.rm=TRUE)
		tdata[tv1] <- tdata[tv1]*tdata[nm]
		tdata[tv2] <- tdata[tv2]*tdata[nm]
		tdata[tv3] <- tdata[tv3]*tdata[nm]
		tdata$rl <- tdata$anm*0
		tdata$ru <- tdata$anm*3
		tdata$ru[tdata$ru==0] <- 1
		tdata$r <- rowSums(tdata[items],na.rm=TRUE)
		tdata$r[tdata$r==tdata$rl] <- tdata$rl[tdata$r==tdata$rl]+0.3
		tdata$r[tdata$r==tdata$ru] <- tdata$ru[tdata$r==tdata$ru]-0.3
		d <- mean(colMeans(tstepvalues[],na.rm=TRUE))
		tdata$t0 <- d + log((tdata$r-tdata$rl)/(tdata$ru-tdata$r))
		tdata$t <- 0
		tdata$t1 <- 1
		iter <- 1
		while(max(abs(tdata$t1-tdata$t),na.rm=TRUE)>0.0000001 & iter <=100) {
			if (iter==1) {tdata$t <- tdata$t0
				} else if (iter>1) {tdata$t <- tdata$t1}
			iter <- iter+1
			for (n in 1:ncol(tstepvalues)) { 
				jp <- paste("jp", titems[n], sep="")
				j2p <- paste("j2p", titems[n], sep="")
				dj <- paste("dj", titems[n], sep="")
				tdata$p1 <- exp(1*tdata$t-tdata[,tv1[n]])/
					(1+exp(1*tdata$t-tdata[,tv1[n]])+exp(2*tdata$t-tdata[,tv1[n]]-tdata[,tv2[n]])+
					exp(3*tdata$t-tdata[,tv1[n]]-tdata[,tv2[n]]-tdata[,tv3[n]]))
				tdata$p2 <- exp(2*tdata$t-tdata[,tv1[n]]-tdata[,tv2[n]])/
					(1+exp(1*tdata$t-tdata[,tv1[n]])+exp(2*tdata$t-tdata[,tv1[n]]-tdata[,tv2[n]])+
					exp(3*tdata$t-tdata[,tv1[n]]-tdata[,tv2[n]]-tdata[,tv3[n]]))
				tdata$p3 <- exp(3*tdata$t-tdata[,tv1[n]]-tdata[,tv2[n]]-tdata[,tv3[n]])/
					(1+exp(1*tdata$t-tdata[,tv1[n]])+exp(2*tdata$t-tdata[,tv1[n]]-tdata[,tv2[n]])+
					exp(3*tdata$t-tdata[,tv1[n]]-tdata[,tv2[n]]-tdata[,tv3[n]]))
				tdata$p1[is.na(tdata[,tv1[n]])] <- NA
				tdata$p2[is.na(tdata[,tv1[n]])] <- NA
				tdata$p3[is.na(tdata[,tv1[n]])] <- NA
				tdata[jp] <- 1*tdata$p1+2*tdata$p2+3*tdata$p3
				tdata[j2p] <- 1*1*tdata$p1+2*2*tdata$p2+3*3*tdata$p3
				tdata[dj] <- tdata[j2p]-tdata[jp]*tdata[jp]
			}
			tdata$e <- rowSums(tdata[, grep("jp", names(tdata))],na.rm = TRUE)
			tdata$v <- rowSums(tdata[, grep("dj", names(tdata))],na.rm = TRUE)
			tdata$t1 <- tdata$t+(tdata$r-tdata$e)/pmax(2*tdata$v,1,na.rm = TRUE)
		}
	
	#Set to missing if the algorithm did not converge
	tdata$t1[tdata$t1>30] <- NA
	
	#Only calculate score scores for those valid responses to at least three items and more than half of the items 
	tdata$t1[tdata$counts<3 | tdata$counts<0.5*ncol(tempdata)] <- NA
	
	##########SCALING FACTORS######################
	#These factors are used to recale the scale scores so that the cut scores are 300 and 400 for all scales.
	#This is a linear transformation so it does not affect the relative distance between two scores: The distance is inflated by a constant factor B.
	#See section 3.3 the national benchmarking documentation referenced above for more details.
	if (scalename == 'eng' ) {
	  A <- 316.027
	  B <- 25.661
	}
	if (scalename == 'clc' ) {
	  A <- 324.047
	  B <- 27.686
	}
	if (scalename == 'rel' ) {
	  A <- 318.803
	  B <- 24.114
	}
	if (scalename == 'par' ) {
	  A <- 300.916
	  B <- 25.772
	}
	if (scalename == 'saf' ) {
	  A <- 328.465
	  B <- 31.578
	}
	if (scalename == 'emo' ) {
	  A <- 332.304
	  B <- 35.115
	}
	if (scalename == 'psaf' ) {
	  A <- 327.846
	  B <- 32.555
	}
	if (scalename == 'bul' ) {
	  A <- 342.271
	  B <- 32.014
	}
	if (scalename == 'sub' ) {
	  A <- 310.583
	  B <- 28.119
	}
	if (scalename == 'env' ) {
	  A <- 323.636
	  B <- 25.919
	}
	if (scalename == 'penv' ) {
	  A <- 333.383
	  B <- 26.649
	}
	if (scalename == 'ins' ) {
	  A <- 322.626
	  B <- 27.086
	}
	if (scalename == 'phea') {
	  A <- 319.552
	  B <- 24.661
	}
	if (scalename == 'men' ) {
	  A <- 309.507
	  B <- 24.669
	}
	if (scalename == 'dis' ) {
	  A <- 326.977
	  B <- 25.694
	}
	
	#Rescale the scores based on the factors above
	tdata$tscore <- round2(tdata$t1*B+A,0)
	
	#Keep only the calculated scale scores
	score_data <- tdata[c("tscore")]
	
	#Rename the data using the scale name and save it to the workspace  
	colnames(score_data) <- paste(scalename, colnames(score_data), sep = "_")
	assign(scalename,score_data,envir = .GlobalEnv)
	}
}

##########SPECIFICATION AND OUTPUT######################
#Read in data
indata <- read.table(file=sprintf("%s\\%s", folder, dataname),header=TRUE,sep=",")
names(indata) <- toupper(names(indata))

#Item names for each scale
eng <- c("NENGCLC2","NENGCLC3","NENGCLC4","NENGCLC6","NENGCLC7","NENGCLC8","NENGREL16","NENGREL17","NENGREL18","NENGREL24","NENGREL25","NENGREL30",
         "NENGPAR34","NENGPAR37","NENGPAR38","NENGPAR44","NENGPAR47")
clc <- c("NENGCLC2","NENGCLC3","NENGCLC4","NENGCLC6","NENGCLC7","NENGCLC8")
rel <- c("NENGREL16","NENGREL17","NENGREL18","NENGREL24","NENGREL25","NENGREL30")
par <- c("NENGPAR34","NENGPAR37","NENGPAR38","NENGPAR44","NENGPAR47")
saf <- c("NSAFEMO51","NSAFEMO52","NSAFEMO53","NSAFEMO54","NSAFEMO55","NSAFEMO148",
         "NSAFPSAF57","NSAFPSAF58","NSAFPSAF59","NSAFPSAF61","NSAFPSAF63","NSAFPSAF64","NSAFBUL65","NSAFBUL66","NSAFBUL70","NSAFBUL76","NSAFBUL77",
         "NSAFBUL78","NSAFBUL79","NSAFSUB83","NSAFSUB84","NSAFSUB85","NSAFSUB87","NSAFSUB88")
emo <- c("NSAFEMO51","NSAFEMO52","NSAFEMO53","NSAFEMO54","NSAFEMO55","NSAFEMO148")
psaf <- c("NSAFPSAF57","NSAFPSAF58","NSAFPSAF59","NSAFPSAF61","NSAFPSAF63","NSAFPSAF64")
bul <- c("NSAFBUL65","NSAFBUL66","NSAFBUL70","NSAFBUL76","NSAFBUL77","NSAFBUL78","NSAFBUL79")
sub <- c("NSAFSUB83","NSAFSUB84","NSAFSUB85","NSAFSUB87","NSAFSUB88")
env <- c("NENVPENV97","NENVPENV98","NENVPENV99","NENVPENV100",
         "NENVPENV102","NENVPENV103","NENVINS109","NENVINS110","NENVINS111","NENVINS140","NENVPHEA115","NENVPHEA117","NENVPHEA118",
         "NENVPHEA119","NENVMEN122","NENVMEN125","NENVMEN126","NENVMEN127","NENVDIS130","NENVDIS131","NENVDIS132","NENVDIS134","NENVDIS134C",
         "NENVDIS135","NENVDIS136","NENVDIS137")
penv <- c("NENVPENV97","NENVPENV98","NENVPENV99","NENVPENV100","NENVPENV102","NENVPENV103")
ins <- c("NENVINS109","NENVINS110","NENVINS111","NENVINS140")
phea <- c("NENVPHEA115","NENVPHEA117","NENVPHEA118","NENVPHEA119")
men <- c("NENVMEN122","NENVMEN125","NENVMEN126","NENVMEN127")
dis <- c("NENVDIS130","NENVDIS131","NENVDIS132","NENVDIS134","NENVDIS134C","NENVDIS135","NENVDIS136","NENVDIS137")

#Run scaling program for each scale
rasch(scalelist=c("eng","clc","rel","par","saf","emo","psaf","bul","sub","env","penv","ins","phea","men","dis"))

#Attach the scale score data to the input data
out <- cbind(indata,eng,clc,rel,par,saf,emo,psaf,bul,sub,env,penv,ins,phea,men,dis)

#Exclude cases if they did not provide any demographic information
for (v in c("eng","clc","rel","par","saf","emo","psaf","bul","sub","env","penv","ins","phea","men","dis")) {
  out[which((is.na(out$NDEMO142) | out$NDEMO142 == "" | out$NDEMO142 == -1) & (is.na(out$NDEMO143) | out$NDEMO143 == "" | out$NDEMO143 == -1) & (is.na(out$NDEMO144) | out$NDEMO144 == "" | out$NDEMO144 == -1)),paste(v,"_tscore",sep="")] <- NA
}

##########OUTPUT DATASET WITH SCALE SCORES###############
write.table(out, sprintf("%s\\scale_score_%s", folder, dataname), col.names=T,row.names=F,sep=",")

##########END OF PROGRAM#################################