************************************************************************ * Fall 1994 exercises with grading macros in Splus * ************************************************************************ Note: References in the macros are to Cochran and to Sarndal. The file district.dat is included. These values are correct if ISEED is set to 2957585 in the BLOCK DATA module of SURVEY. ****************file district.dat for ISEED=2957585********************* 1 142 142 549 66448 2 153 295 609 60122 3 135 430 514 65797 4 128 558 477 56084 5 110 668 431 56592 6 103 771 418 58795 7 105 876 374 69629 8 385 1261 1386 77155 9 296 1557 1059 73605 10 287 1844 1002 69039 11 253 2097 905 58635 12 172 2269 660 55189 13 198 2467 759 64406 14 432 2899 1548 76384 15 248 3147 964 69636 16 251 3398 922 53881 17 221 3619 858 67928 18 297 3916 1059 78802 19 235 4151 860 71418 20 171 4322 641 53214 21 135 4457 517 67860 22 254 4711 893 64890 23 203 4914 763 77586 24 244 5158 1013 82727 25 202 5360 741 68232 26 103 5463 401 55069 27 102 5565 407 61346 28 115 5680 438 60559 29 180 5860 694 67770 30 190 6050 817 70615 31 152 6202 548 66993 32 141 6343 555 58621 33 143 6486 588 57266 34 135 6621 481 57311 35 178 6799 661 59424 36 221 7020 849 62233 37 174 7194 654 53194 38 101 7295 317 49778 39 95 7390 355 57749 40 130 7520 477 57899 41 152 7672 613 56030 42 169 7841 712 58756 43 91 7932 352 50803 44 283 8215 1036 60249 45 562 8777 1979 57333 46 312 9089 1108 52056 47 897 9986 3203 62436 48 734 10720 2617 61524 49 963 11683 3462 60091 50 642 12325 2280 55114 51 525 12850 1869 95235 52 726 13576 2513 68411 53 674 14250 1931 54494 54 585 14835 1203 48185 55 553 15388 1111 42886 56 583 15971 2084 95676 57 911 16882 2717 84813 58 1051 17933 2450 56963 59 918 18851 1832 36610 60 799 19650 1610 44400 61 545 20195 1891 101460 62 895 21090 2768 75407 63 1313 22403 2772 56079 64 968 23371 2418 62998 65 717 24088 2239 69795 66 651 24739 1692 91677 67 886 25625 2846 82857 68 912 26537 2911 77141 69 898 27435 2647 72167 70 759 28194 2569 80097 71 722 28916 2512 86883 72 753 29669 2701 79953 73 793 30462 2737 79166 74 725 31187 2139 82016 75 802 31989 2845 80879 *******************assignment 1***************************************** Assignment 1: Simple random sampling 1. Use the addgen and survey programs to obtain the answers to the questionaire for a simple random sample of 200 addresses from Lockhart City. In your writeup provide the seed number you used in addgen. Estimate the following from your sample. Give standard errors for your estimates: a. The average price a household in Lockhart City is willing to pay for cable TV service. Actually we only know for each sampled household the price it is willing to pay for service rounded down to the nearest $5. Recognizing this limitation to question 4 of the survey questionnaire, use the answers to that question as the prices that the sampled houses are willing to pay. b. The average number of TV's per household. c. The proportion of houses willing to pay $10 for cable service. This really means, of course, at least $10. d. The average number of hours spent per week watching TV in households willing to pay $10 for cable service. e. The total number of adults in households willing to pay $10 for cable service. 2. Using a suitable auxiliary variable repeat problems 1a and 1b with a ratio estimator. Given two estimates ybar1 and ybar2 of YBAR derived from the same sample, the relative precision of ybar1 to ybar2 is Var(ybar2)/Var(ybar1). Equivalently, if sample sizes n1 and n2 are required to achieve the same standard error using ybar1 and ybar2, respectively, the relative precision is n2/n1. Estimate from your sample the relative precision of of the ratio in (4a,b) to the sample mean estimate (in 3a,b) for both the average number of TV's in a household in Lockhart City and the average price a household is willing to pay for cable TV. IMPORTANT: How are your results related to the survey program assumptions. #assignment 1 grading program data<-matrix(scan("sample.dat"),byrow=T,ncol=12) distdat<-matrix(scan("district.dat"),byrow=T,ncol=5) if (min(data[,1])<50.5) {print("This is not a sample of Lockhart City.")} n<-dim(data)[1] N<-19664 print(c("sample size:",n)) price<-data[,7] f<-n/N print("Problem 1:") se<-sqrt((1-f)*var(price)/n) print(c("average price:",mean(price),se)) ntv<-data[,6] se<-sqrt((1-f)*var(ntv)/n) print(c("number of tv's:",mean(ntv),se)) pay<-ifelse(price>9.9,1,0) se<-sqrt((1-f)*var(pay)/n) print(c("proportion willing to pay $10:",mean(pay),se)) nhours<-data[,8] nhours<-nhours*pay rhat<-sum(nhours)/sum(pay) w<-(nhours-rhat*pay)/mean(pay) se<-sqrt((1-f)*var(w)/n) print(c("average number of TV hours (houses willing to pay $10):",rhat,se)) nadult<-data[,4] nadult<-nadult*pay that<-N*mean(nadult) se<-N*sqrt((1-f)*var(nadult)/n) print(c("total number of adults (houses willing to pay $10):",that,se)) print("Problem 2:") print("Which auxiliary variable did you use?") print(c("1. district number","2. district number of houses", "3. cumulative house count","4. district population", "5. district mean assessed house value","6. district mean household size", "9. none of the above")) print("enter the number of your auxiliary variable") reply<-readline() if (nchar(reply)>0) {reply<-as.integer(reply) if ((reply<=6)&(reply>=1)) {aux<-data[,1] if (reply==1) { auxbar<-sum(distdat[51:75,1]*distdat[51:75,2])/sum(distdat[51:75,2])} if (reply==2) {aux<-distdat[aux,2] auxbar<-1} if (reply==3) {aux<-distdat[aux,3] auxbar<-sum(distdat[51:75,3]*distdat[51:75,2])/sum(distdat[51:75,2])} if (reply==4) {aux<-distdat[aux,4] auxbar<-sum(distdat[51:75,4]*distdat[51:75,2])/sum(distdat[51:75,2])} if (reply==5) {aux<-distdat[aux,5] auxbar<-sum(distdat[51:75,5]*distdat[51:75,2])/sum(distdat[51:75,2])} if (reply==6) {aux<-distdat[aux,4]/distdat[aux,2] auxbar<-sum(distdat[51:75,4])/sum(distdat[51:75,2])} se<-sqrt((1-f)*var(price)/n) rhat<-mean(price)/mean(aux) w<-(price-rhat*aux) se.rat1<-sqrt((1-f)*var(w)/n) se.rat2<-se.rat1*auxbar/mean(aux) print(c("average price, ratio estimate:",rhat*auxbar,se.rat1)) print(c("alternative standard error estimate:",se.rat2)) print(c("relative precision (ratio est. to sample mean est.)",(se/se.rat1)^2,(se/se.rat2)^2)) ntv<-data[,6] se<-sqrt((1-f)*var(ntv)/n) rhat<-mean(ntv)/mean(aux) w<-(ntv-rhat*aux) se.rat1<-sqrt((1-f)*var(w)/n) se.rat2<-se.rat1*auxbar/mean(aux) print(c("number of TV's, ratio estimate:",rhat*auxbar,se.rat1)) print(c("alternative standard error estimate:",se.rat2)) print(c("relative precision (ratio est. to sample mean est.)",(se/se.rat1)^2,(se/se.rat2)^2))}} **************************assignment 2********************************** Assignment 2: Stratification Use any considerations you like to divide Lockhart City into approximately five strata. Then use ADDGEN to generate a stratified random sample of size 200 from Lockhart City with this stratification and proportional allocation. Find the responses using the program SURVEY. Estimate the average price a household in Lockhart City is willing to pay for cable service and the average number of TV's per household in Lockhart City. #assignment 2 grading program #strat is to be a list with strat[[i]] the vector of districts in stratum i #for example, if the strata are 51-65, 67 69 71 73 75, 66 68 70 72 74 #strat can be set up using the Splus instruction #strat<-list(51:65,c(67,69,71,73,75),c(66,68,70,72,74)) #output is n, N, ybar, Sy for each stratum #followed by overall n, N, stratified mean, and standard error of the stratified mean STRATSTAT<-function(N,jstrat,y) { nstrat<-range(jstrat)[2] n<-NULL ybar<-NULL sy<-NULL for (i in 1:nstrat) { n<-c(n,length(jstrat[jstrat==i])) ybar<-c(ybar,mean(y[jstrat==i])) sy<-c(sy,sqrt(var(y[jstrat==i])))} ybar<-c(ybar,sum(N*ybar)/sum(N)) sy<-c(sy,sqrt(sum((1/n-1/N)*sy*sy*N*N))/sum(N)) n<-c(n,sum(n)) N<-c(N,sum(N)) list(N=N,n=n,ybar=ybar,sy=sy)} distdat<-matrix(scan("district.dat"),byrow=T,ncol=5) data<-matrix(scan("sample.dat"),byrow=T,ncol=12) test<-sort(unlist(strat)) if (length(test)!=25) { print("this is not a stratification of Lockhart City") stop} if (sum((test-51:75)^2)!=0) { print("this is not a stratification of Lockhart City") stop} istrat<-rep(0,75) nstrat<-length(strat) N<-NULL for (i in 1:nstrat) { istrat[strat[[i]]]<-i N<-c(N,sum(distdat[strat[[i]],2]))} price<-STRATSTAT(N,istrat[data[,1]],data[,7]) for (i in 1:nstrat) {nh<-price$N[i]*price$n[nstrat+1]/price$N[nstrat+1] if (((nh-price$n[i])^2)>1) {print(c("This is not proportional allocation, stratum ",i)) print(c("Sample size, proportional allocation: ",nh)) print(c("Sample size: ",price$n[i]))}} print("cable price") print(price) tv<-STRATSTAT(N,istrat[data[,1]],data[,6]) print("number of TVs") print(tv) *******************assignment 3***************************************** Assignment 3: Poststratification 1. Take a simple random sample of size 200 from Lockhart City and post stratify it according to house valuation using the following information obtained from the County Tax Assessor: DISTRIBUTION OF RESIDENTIAL ASSESSED VALUATIONS LOCKHART CITY: House Value Number 0 to 39999 1021 40000-49999 1786 50000-59999 2724 60000-69999 2603 70000-79999 4592 80000-89999 4608 90000-99999 1788 100000 and above 542 TOTAL 19664 Estimate the average price a household is willing to pay for Cable TV and the average number of TV's per household. 2. We know the distribution of the number of hours spent watching TV for the U.S. as a whole, given in the table below. DISTRIBUTION OF TV VIEWING TIME--U.S.: Number of hours spent watching TV Proportion of households 0 - 24 23.4% 25 - 49 22.6% 50 - 99 36.4% 100 and more 17.6% Use this information to post-stratify your SRS from problem 1, and compare and critically evaluate your estimates for problems 1 and 2. #Assignment 3 grading macro #output is poststratified mean and standard error #standard error computed using Taylor approximation to order n^(-2) #see Cochran for relevant formula data<-matrix(scan("sample.dat"),byrow=T,ncol=12) hval<-data[,3] cprice<-data[,7] ntv<-data[,6] nhours<-data[,8] n<-dim(data)[1] N<-19664 f<-n/N # #problem 1 print("problem 1") Wh<-c(1021,1786,2724,2603,4592,4608,1788,542)/19664 breaks<-c(40000,50000,60000,70000,80000,90000,100000) nbrk<-length(breaks) istrat<-rep(1,n) for (i in 1:nbrk) istrat<-ifelse((hval>=breaks[i]),i+1,istrat) s2ip<-NULL s2iTV<-NULL ybarip<-NULL ybariTV<-NULL for (i in 1:(nbrk+1)) {ybarip<-c(ybarip,mean(cprice[istrat==i])) ybariTV<-c(ybariTV,mean(ntv[istrat==i])) s2ip<-c(s2ip,var(cprice[istrat==i])) s2iTV<-c(s2iTV,var(ntv[istrat==i]))} ybarp<-sum(Wh*ybarip) s2p<-(1-f)*sum(Wh*s2ip)/n + sum((1-Wh)*s2ip)/(n^2) sep<-sqrt(s2p) print(c("cable price",ybarp,sep)) ybarTV<-sum(Wh*ybariTV) s2TV<-(1-f)*sum(Wh*s2iTV)/n + sum((1-Wh)*s2iTV)/(n^2) seTV<-sqrt(s2TV) print(c("number of TV's",ybarTV,seTV)) # #problem 2 print("problem 2") Wh<-c(.234,.226,.364,.176) breaks<-c(25,50,100) nbrk<-length(breaks) istrat<-rep(1,n) for (i in 1:nbrk) istrat<-ifelse((nhours>=breaks[i]),i+1,istrat) s2ip<-NULL s2iTV<-NULL ybarip<-NULL ybariTV<-NULL for (i in 1:(nbrk+1)) {ybarip<-c(ybarip,mean(cprice[istrat==i])) ybariTV<-c(ybariTV,mean(ntv[istrat==i])) s2ip<-c(s2ip,var(cprice[istrat==i])) s2iTV<-c(s2iTV,var(ntv[istrat==i]))} ybarp<-sum(Wh*ybarip) s2p<-(1-f)*sum(Wh*s2ip)/n + sum((1-Wh)*s2ip)/(n^2) sep<-sqrt(s2p) print(c("cable price",ybarp,sep)) ybarTV<-sum(Wh*ybariTV) s2TV<-(1-f)*sum(Wh*s2iTV)/n + sum((1-Wh)*s2iTV)/(n^2) seTV<-sqrt(s2TV) print(c("number of TV's",ybarTV,seTV)) ***********************assignment 4************************************* Assignment 4: Two stage cluster designs 1. Design a two stage cluster sampling scheme for the rural areas (districts 1-43) of Stephens County which chooses between 25% and 50% of the districts (clusters) with probability proportional to size and with replacement, and has a constant subsample size within each chosen district. The total survey cost should be about $3850 (the approximate cost of a simple random sample of size 100 from the rural areas). Execute the sample and estimate the average price a rural household is willing to pay for cable TV using both an unbiased estimate and a ratio estimate using the house assessed value as an auxiliary variable. Be sure to give standard errors. The remainder of this problem set refers to the unbiased estimator. 2. Take a simple random sample of size 100 from districts 1-43 and use this sample to estimate the average price a rural household is willing to pay for cable TV service (using the sample mean estimate). For future reference in studying a similar population, it is often useful to consider what would have occured if a different sampling technique had been adopted. 3. Estimate from your samples the within cluster variance VSSU and the between cluster variance VPSU for the rural areas (1-43) and design III. What would have been the optimal allocation between PSU's and SSU's for a total expected budget of $3850? 4. I have been able to determine that a simple random sample of size 200 from districts 1-43 would visit about 42 districts and cost about $5720. Are any two stage designs an improvement over simple random sampling to determine the average price a rural household is willing to pay if a budget of $5720 is adopted for sampling? Use the results of problem 3-- you do not need to take a new sample. #Assignment 4 grading macro #cluster sample in file sample.dat #simple random sample in file ruralsrs.dat #vssu is an unbiased estimated of second term of Sarndal et al eq (4.5.2) #thus vtot=vpsu+vssu is an unbiased estimate of the components of (4.5.2) #note: in unfortunate samples it is possible for vpsu<0 #note: when the sample size within clusters n2 (nhsdist below) is constant, # eqn (4.5.2) can be written as vtot = sig2b/mI + sig2w/(mI*n2) # where mI is the number of PSU's (ndist below) # and sig2w = SUM(Ni*S2i/N) where S2i is the variance in cluster i and # SUM is a sum over all PSU's in the population # sig2w/(mI*n2) is vssu except for second stage fpc # an unbiased estimate of sig2w is sum(s2i)/mI where # sum is a sample sum and s2i is the sample variance in cluster i # print("Problem 1") print("Enter number of first stage sampling units (PSU's)") reply<-readline() if (nchar(reply)==0) stop ndist<-as.integer(reply) if (ndist<10) print("At least 10 districts must be sampled.") if (ndist>22) print("At most 22 districts can be sampled.") data<-matrix(scan("sample.dat"),byrow=T,ncol=12) N<-7932 Hval<-65520 idist<-data[,1] ihouse<-data[,2] nhouse<-length(ihouse) jdist<-sort(unique(idist)) nddist<-length(jdist) ahsdist<-round(nhouse/ndist) for (i in 1:nddist) {temp<-length(ihouse[idist==jdist[i]]) nsamp<-round(temp/ahsdist) if ((temp-nsamp*ahsdist)!=0) { print(c("each district must be visited the same number of times",jdist[i],temp))} if (nsamp>1) {temp2<-ifelse(idist==jdist[i],1,0) temp2<-idist+ceiling(cumsum(temp2)/ahsdist)*(.1) idist<-ifelse(idist==jdist[i],temp2,idist)}} nhsdist<-tapply(idist,list(idist),length) hval<-data[,3] cprice<-data[,7] print(c("problem 1: ndist, nhouse, cost",ndist,nhouse,60*ndist+16*nhouse)) print(c("number of houses in each district:",nhsdist)) cmean<-tapply(cprice,list(idist),mean) print(c("district sample mean cable price: ",cmean)) cprice.est<-mean(cmean) vtot<-var(cmean)/ndist se<-sqrt(vtot) print(c("sample mean estimate, standard error",cprice.est,se)) hmean<-tapply(hval,list(idist),mean) hprice.est<-mean(hmean) print(c("district sample mean house value:",hmean)) print(c("sample mean estimate of mean house value, true mean house value:", hprice.est,Hval)) rhat<-cprice.est/hprice.est cprice.ratest<-rhat*Hval se<-sqrt(var(cmean-rhat*hmean)/ndist) print(c("cable price ratio estimate, standard error:",cprice.ratest,se)) # data<-matrix(scan("ruralsrs.dat"),byrow=T,ncol=12) cprice.srs<-mean(data[,7]) s2<-var(data[,7]) se<-sqrt(s2*(1-100/N)/100) print(c("problem 2--simple random sample",cprice.srs,se)) # distdat<-matrix(scan("district.dat"),byrow=T,ncol=5) jdist<-sort(unique(idist)) Ni<-distdat[floor(jdist),2] s2i<-tapply(cprice,list(idist),var) vssu<-sum((1-nhsdist/Ni)*s2i/nhsdist)/(ndist^2) vpsu<-vtot-vssu print(c("problem 3--vtot,vpsu,vssu:",vtot,vpsu,vssu)) sig2w<-mean(s2i) sig2b<-ndist*(vtot-sig2w/sum(nhsdist)) c1<-60 c2<-16 nbar<-sqrt((c1*sig2w)/(c2*sig2b)) n1<-3850/(c1+c2*nbar) print(c("sig2b, sig2w, nbar, nI",sig2b,sig2w,nbar,n1)) # se<-sqrt(s2*(1-200/N)/200) print(c("problem 4--srs n=200, se:",se)) n1<-5720/(c1+c2*nbar) se<-sqrt(sig2b/n1+sig2w/(nbar*n1)) print(c("cluster sample, n1, se",n1,se)) *******************assignment 5***************************************** Assignment 5: Complicated surveys We are interested in estimating the average price a household is willing to pay for cable TV service in Stephens County and separately in the following four strata: STRATUM I: Rural areas, districts 1-43 STRATUM II: Three villages, districts 44-46 STRATUM III: Eavesville, districts 47-50 STRATUM IV: Lockhart City, districts 51-75 It is proposed to make the probability that any given house appears in the sample be approximately 1%. More specifically the design in each stratum will be STRATUM I: 16 districts chosen with probability proportional to size (with replacement), 5 houses sampled in each chosen district, unbiased estimate. STRATUM II: A stratified random sample with 3, 6 and 3 houses chosen in districts 44, 45, and 46 respectively; stratified (unbiased) mean estimates. STRATUM III: Simple random sample size 32 with a ratio to house assessed value estimate. STRATUM IV: Simple random sample size 197 with a poststratified (on house value) estimate. Execute a sample according to the above design and estimate with standard error the average price a household in Stephens County is willing to pay for cable TV service. Also estimate with standard error the average price a household in each of the four strata is willing to pay for service. #Assignment 5 grading macro data1<-matrix(scan("str1.dat"),byrow=T,ncol=12) data2<-matrix(scan("str2.dat"),byrow=T,ncol=12) data3<-matrix(scan("str3.dat"),byrow=T,ncol=12) data4<-matrix(scan("str4.dat"),byrow=T,ncol=12) district.dat<-matrix(scan("district.dat"),byrow=T,ncol=5) # #stratum 1 print("stratum 1") idist<-data1[,1] ihouse<-data1[,2] if (max(idist)>43.1) print("str1.dat is not a data set from districts 1-43") if (length(ihouse)!=80) print("incorrect sample size--stratum 1") jdist<-sort(unique(idist)) for (i in 1:length(jdist)) {temp<-length(ihouse[idist==jdist[i]]) nsamp<-round(temp/5) if ((temp-nsamp*5)!=0) { print(c("each district must be visited 5 times",jdist[i],temp))} if (nsamp>1) {temp2<-ifelse(idist==jdist[i],1,0) temp2<-idist+ceiling(cumsum(temp2)/5)*(.1) idist<-ifelse(idist==jdist[i],temp2,idist)}} ndist<-length(unique(idist)) if (ndist!=16) print(c("incorrect number of districts visited",ndist)) cprice<-data1[,7] cmean<-tapply(cprice,list(idist),mean) print(c("district sample mean cable price: ",cmean)) Nh<-sum(district.dat[1:43,2]) chat<-mean(cmean) vhat<-var(cmean)/ndist se<-sqrt(vhat) print(c("Nh, unbiased pps estimate, standard error",Nh,chat,se)) # #stratum 2 print("stratum 2") STRATSTAT<-function(N,jstrat,y) { nstrat<-range(jstrat)[2] n<-NULL ybar<-NULL sy<-NULL for (i in 1:nstrat) { n<-c(n,length(jstrat[jstrat==i])) ybar<-c(ybar,mean(y[jstrat==i])) sy<-c(sy,sqrt(var(y[jstrat==i])))} ybar<-c(ybar,sum(N*ybar)/sum(N)) sy<-c(sy,sqrt(sum((1/n-1/N)*sy*sy*N*N))/sum(N)) n<-c(n,sum(n)) N<-c(N,sum(N)) list(N=N,n=n,ybar=ybar,sy=sy)} if ((min(data2[,1])<43.9)|(max(data2[,1])>46.1)) { print("str2.dat is not a data set from districts 44-46")} istrat<-rep(0,75) N<-NULL for (i in 1:3) {j<-43+i istrat[j]<-i N<-c(N,district.dat[j,2])} temp<-STRATSTAT(N,istrat[data2[,1]],data2[,7]) if (sum((temp$n-c(3,6,3,12))^2)!=0) print(c("incorrect sample sizes--stratum 2",temp$n)) print(c("Nh, stratified mean estimate, standard error",temp$N[4],temp$ybar[4],temp$sy[4])) Nh<-c(Nh,temp$N[4]) chat<-c(chat,temp$ybar[4]) vhat<-c(vhat,(temp$sy[4]^2)) # #stratum 3 print("stratum 3") if ((min(data3[,1])<46.9)|(max(data3[,1])>50.1)) { print("str3.dat is not a data set from districts 47-50")} n<-dim(data3)[1] if (n!=32) print("incorrect sample size--stratum 3") N<-sum(district.dat[47:50,2]) hval<-data3[,3] cprice<-data3[,7] Hval<-60079 rhat<-mean(cprice)/mean(hval) chat<-c(chat,rhat*Hval) vhat<-c(vhat,(1/n-1/N)*var(cprice-rhat*hval)) se<-sqrt(vhat[3]) print(c("Nh, ratio estimate, standard error",N,chat[3],se)) Nh<-c(Nh,N) # #stratum 4 print("stratum 4") if (min(data4[,1])<50.9) print("str4.dat is not a data set from districts 51-75") n<-dim(data4)[1] if (n!=197) print("incorrect sample size--stratum 4") N<-sum(district.dat[51:75,2]) hval<-data4[,3] cprice<-data4[,7] Wh<-c(1021,1786,2724,2603,4592,4608,1788,542)/19664 breaks<-c(40000,50000,60000,70000,80000,90000,100000) nbrk<-length(breaks) istrat<-rep(1,n) for (i in 1:nbrk) istrat<-ifelse((hval>=breaks[i]),i+1,istrat) s2ip<-NULL ybarip<-NULL for (i in 1:(nbrk+1)) {ybarip<-c(ybarip,mean(cprice[istrat==i])) s2ip<-c(s2ip,var(cprice[istrat==i]))} chat<-c(chat,sum(Wh*ybarip)) vhat<-c(vhat,(1-n/N)*sum(Wh*s2ip)/n + sum((1-Wh)*s2ip)/(n^2)) se<-sqrt(vhat[4]) print(c("Nh, post-stratified mean estimate, standard error",N,chat[4],se)) Nh<-c(Nh,N) # #overall estimate #print("Stephens County") N<-sum(Nh) chat<-sum(Nh*chat)/N vhat<-sum((Nh^2)*vhat)/(N^2) se<-sqrt(vhat) print(c("N, combined estimate, standard error",N,chat,se)) ***********************assignment 6************************************* Assignment 6: Variance Estimation for Complex Surveys For these exercises, draw a simple random sample of size 200 from Lockhart City. We want to estimate R = mu_y/mu_x, the ratio of the price a household is willing to pay for cable TV (Y) to the assessed value of the house (X). 1. Randomly divide your sample into 10 different subsamples, each of size 20. Be sure to explain how you randomly divided your sample. Find the ratio r_i = ybar_i/xbar_i for each group. The r_i are 10 (almost) independent observations; if we use rbar to estimate R, then the estimated variance of rbar is sum (r_i - rbar)^2/90. For the purpose of grading, rearrange your output file from SURVEY so that the first group is the first 20 observations, the second group is the next 20 observations, etc. 2. Calculate 200 different estimates of R, each using all but one of the 200 data points. One way of doing the calculations is to define two new variables, SUMX and SUMY, to be the sum of all 200 observations for X and Y, respectively. Then the variable YJACK = (SUMY - Y)/199 contains the 200 values of ybar(j). Use the variables YJACK and XJACK to calculate the jackknife estimate of the variance of r = ybar/xbar. 3. How do your variance estimates from 1 and 2 compare with the usual Taylor series-based estimate of the variance of this ratio? #Assignment 6 grading macro #It is assumed that the data file sample.dat is organized #so that the 1st random group is the first 20 observations, etc. data<-matrix(scan("sample.dat"),byrow=T,ncol=12) hval<-data[,3] cprice<-data[,7] if (min(data[,1])<50.5) print("ERROR: This is not a sample of Lockhart City.") ndat<-dim(data)[1] if (ndat!=200) print(c("ERROR: This sample is not of size 200.",ndat)) ngroup<-floor((ndat+.5)/20) if (sum((data[,1]-sort(data[,1]))^2)==0) print("ERROR: The groups have not been randomly selected.") ri<-NULL for (i in 1:ngroup) ri<-c(ri,mean(cprice[(20*i-19):(20*i)])/mean(hval[(20*i-19):(20*i)])) print(c("ri",ri)) rbar<-mean(ri) vhat<-var(ri)/ngroup print(c("random groups: rbar, vhat, se",rbar,vhat,sqrt(vhat))) #jackknife hvaljack<-(sum(hval)-hval)/(ndat-1) cpricejack<-(sum(cprice)-cprice)/(ndat-1) rhatj<-cpricejack/hvaljack rhat<-mean(rhatj) vhat<-(ndat-1)*sum((rhatj-rhat)^2)/ndat print(c("jackknife: rhat, vhat, se",rhat,vhat,sqrt(vhat))) #taylor series approximation rhat<-mean(cprice)/mean(hval) vhat<-(1-ndat/19664)*var(cprice-rhat*hval)/(ndat*(mean(hval)^2)) print(c("taylor series: rhat, vhat, se",rhat,vhat,sqrt(vhat))) *********************assignment 7*************************************** Assignment 7 Double sampling and regression estimators Suppose we do not know the average assessed value of the houses in Lockhart City. The Stephen's County Tax assessor will give the assessed value of a house for a charge of $1. Within the SURVEY program this is accomplished by using an address with a negative district number. A simple random sample of Lockhart City of size 200 would probably visit all 25 districts and cost $3100. Suppose instead we were to take a simple random sample of 650 houses and obtain their assessed values. Then for the same total cost we could survey 150 houses and use a double sampling regression estimate, using house value as the auxiliary variable, of the average price a household is willing to pay for cable TV. Execute this design. Use Sarndal et al formula 9.7.28 to estimate the variance using its simplified form with all the g-ks's set equal to 1. #assignment 7 grading macro #advance sample (size 650) to be in lc.adv.dat #final sample (size 150) to be in lc.final.dat adata<-scan("lc.adv.dat",what=list(0,0,0),flush=T) adata<-cbind(adata[[1]],adata[[2]],adata[[3]]) fdata<-matrix(scan("lc.final.dat"),byrow=T,ncol=12) na<-dim(adata)[1] ns<-dim(fdata)[1] if (min(adata[,1])<50.5) { print("advance sample is not a sample from Lockhart City")} print(c("advance sample size:",na)) print(c("final sample size:",ns)) temp<-c(adata[,1]*10000+adata[,2],fdata[,1]*10000+fdata[,2]) subchk<-duplicated(temp) if (!all.equal(subchk,c(rep(F,na),rep(T,ns)))) { print("final sample is not a subsample from initial sample")} reg<-lsfit(fdata[,3],fdata[,7]) yhat<-reg$coef[1]+reg$coef[2]*adata[,3] e<-reg$residuals e1<-fdata[,7] piak<-na/19664 cpk<-ns/na pikstar<-piak*cpk chat<-sum(yhat/piak)+sum(e/pikstar) chat<-chat/19664 piakl<-matrix(piak*(na-1)/19663,nrow=ns,ncol=ns) cpikl<-matrix(ns*(ns-1)/(na*(na-1)),nrow=ns,ncol=ns) for (i in 1:ns) {piakl[i,i]<-piak cpikl[i,i]<-cpk} piklstar<-piakl*cpikl dela<-piakl-piak*piak cdel<-cpikl-cpk*cpk e1check<-e1/piak edblcheck<-e/pikstar v1<-dela*(e1check%o%e1check)/piklstar v2<-cdel*(edblcheck%o%edblcheck)/cpikl vhat<-(sum(v1)+sum(v2))/(19664^2) se<-sqrt(vhat) print(c("chat,vhat,se",chat,vhat,se)) ************************************************************************ * end of transmission * ************************************************************************