# pctfp.R
# This R program draws a graph showing the percentage of positive
# results that are false-positive errors in a scientific field
# plotted against the percentage of research hypotheses that are true
# in the field, based on certain arguably sensible assumptions.
# The program is self-contained and runs under the freeware statistical
# software system R, which can be downloaded from many sites listed at
# the link to CRAN at https://www.r-project.org/
# The figure is drawn on the screen in an R graphics window.
# The figure is also saved in an EMF file, which is a scalable vector
# format that can be imported into Microsoft Word using Insert >
# Pictures. The figure is also saved in a PNG file, which is suitable
# for use on web pages.
# Don Macnaughton, July 2019
# You can understand this program without having the R software on your
# computer and without understanding the R language because the
# operation of the program is explained in the comments.
#********************************************************************
# STEPS TO RUN THE PROGRAM
# If you wish to run this program on a computer, you must perform the
# following five steps:
# 1. Install R on the computer if it's not already installed.
# 2. Install the R packages named in the following library()
# statements on the computer if they're not already installed.
library(devEMF) # used to plot the EMF version of the graph
library(psych) # used to compute some descriptive statistics to
# assist with checking the generated data.
# 3. Open the program in R and modify the following statement to
# specify the file name and a valid path for the EMF file (less
# than 10 kilobytes) on the computer:
EMFfile = "C:/Temp/pctfp.emf"
# 4. Modify the following statement to specify the file name and a
# valid path for the PNG file (also less than 10 kilobytes) on
# the computer. (The image can be cropped using, e.g., Adobe
# Photoshop Elements):
PNGfile = "C:/Temp/pctfp.png"
# 5. Run the program. (In the standard R Windows interface, have the
# focus on the program and select Edit > Run all.)
#********************************************************************
# DATA-GENERATION STEPS
# This description is easiest to understand if you have a copy of
# the graph from the program available for viewing while you read.
# A copy of the graph is at
# https://matstat.com/pctfp.png
# The x- and y-coordinates of a set of horizontally-evenly-spaced
# points on the line on the graph are respectively generated
# in two vectors. We assume that these vectors are each column vectors
# equivalent to columns of values for variables in a standard data
# table. Each vector contains 61 values. R uses these x- and y-
# coordinates to plot the points on the line on the graph. R joins
# adjacent points with short appropriately curving line segments,
# which makes the line appear smooth on the graph.
# First, generate the column vector PctTrue, which specifies the range
# of 61 values on the horizontal axis of the graph (i.e., the x-
# coordinates of the point on the line) as the integers between 0 and
# 60 inclusive.
# The notation 0:60 uses the R colon operator to generate a vector
# containing the sequence of integers from 0 to 60.
PctTrue = 0:60
# The critical p-value used in an area of scientific research
# specifies the fraction of the time that a research project will make
# a false-positive error in cases when the null hypothesis is true
# (assuming that the assumptions underlying the p-value are satisfied).
# The line on the graph will be for the following estimated average
# critical p-value used by journals in the field:
CritP = 0.05
# The power of a statistical test for a given effect size is the
# fraction of the time that an effect of that size will be detected
# if the effect of the specified size exists in the population and
# if the research project were repeated over and over, each time
# using a fresh sample (and if the assumptions underlying the p-value
# are satisfied).
# The line on the graph will be for the following estimated average
# power of the statistical tests performed in the field:
Power = 0.6
# The following operations are operations on column vectors of 61
# values.
# For concreteness, for EACH of the 61 rows in the column vectors we
# consider 1000 research hypotheses in the scientifc field under
# consideration. For each row, we compute from the value of PctTrue
# for the row (a) the number and then (b) the percentage of the
# positive results that reflect false-positive errors and store the
# result (the y-coordinates of the points on the line) in the
# associated row of the column vector PctFalsPos for later plotting.
# We do this under the assumed critical p-value and assumed power
# specified above.
# Note that the following computations are element-by-element
# vector operations, not to be confused with the notationally the
# same but mathematically different matrix operations of matrix
# algebra.
# First, compute the number of the 1000 research hypotheses that are
# true for each of the 61 rows and store the results in the column
# vector nTrueHyp. In the following statement R multiplies each of
# the 61 values in the PctTrue vector by 1000 / 100 and puts the
# the result in corresponding element of the 61–element vector
# nTrueHyp.
nTrueHyp = 1000 * PctTrue / 100
# Compute the number of true hypotheses that will achieve positive
# results for each of the 61 rows.
nTruePos = Power * nTrueHyp # Each of the 61 values in nTrueHyp
# is multiplied by the scalar value
# in Power and the respective results
# are placed in the 61-element vector
# nTruePos.
nFalseHyp = 1000 - nTrueHyp # The scalar value of 1000 is
# converted to a 61-element vector
# with each element equal to 1000 and
# then element-by-element subtraction
# is done.
# Compute the number of false-positive results for each row.
nFalsePos = CritP * nFalseHyp
# Finally, compute the variable for the vertical axis of the graph:
PctFalsPos = 100 * nFalsePos / (nTruePos + nFalsePos)
# This completes the generation of the data for the graph.
#********************************************************************
# DATA-CHECKING STEPS
# Create a data frame holding the various columns of data. This makes
# it easy to examine the data for correctness. Note that CritP and
# Power are scalars, but they're converted to vectors of values that
# are all equal to the scalar value when they're inserted in the data
# frame.
datf = data.frame(PctTrue, CritP, Power, nTrueHyp, nTruePos, nFalseHyp,
nFalsePos, PctFalsPos)
# Check the generated data. First, print the first 15 rows of the
# data frame. In the output, note how the PctTrue column and the
# PctFalsPos column respectively specify the x- and y-coordinates
# of the line on the graph for values on the horizontal axis between
# 0 and 14.
head(datf, n=15)
# Generate descriptive statistics for each of the eight variables in
# the data frame. These statistics help to verify that the program
# is working properly.
describe(datf)
#********************************************************************
# GRAPH PLOTTING STEPS
# Prepare for plotting. First, specify values of some plotting
# parameters in R variables. These values will be used at various
# places below.
AxisCol = "darkgray" # axis color
AxisWid = 2 # axis width
TickWid = 2 # tick width
g2col = "darkgray" # color of the two gray lines
g2thick = 2 # thickness of the two gray lines
# Specify relevant attributes of the curving blue line.
linewid = 2 # line width
linecol = "blue3" # line color
# linecol = "black"
linetype = 1 # solid line
#********************************************************************
# Use a loop to draw the graph three times, first in the EMF file, next
# in the PNG file, and last in the graphics window.
for (pass in 1:3){
# Close any open graphics output device.
graphics.off()
# Open the relevant graphics output device, depending on the pass.
# If this is the first pass, specify the device and file for the
# EMF graph using the earlier-specified filename stored near the
# beginning of the program in the variable EMFfile.
# For emf graph generation by the devEMF package to work properly:
# - custom.lty may be TRUE or FALSE.
# - emfPlus must be TRUE (the default) or the graph is invisible
# in the final PDF out of Microsoft Word.
# - emfPlusFont must be TRUE if the graph has "long" character
# strings.
if (pass == 1) {emf(EMFfile, emfPlusFont=TRUE)}
# If this is the second pass, specify the device and file for the
# PNG graph using the earlier-specified filename stored in the
# variable PNGfile.
if (pass == 2) {png(PNGfile, width=1024, height=768)}
# If this is the third pass, specify the graphics window.
if (pass == 3) {windows()}
# Save the old graphics parameter values and set new values.
oldpar <- par
par(cex=1.2, xpd=NA, fin=c(6.9, 5.8),
mar=c(6, 5, 4, 2)+0.1)
# Set the size of the PNG graph.
if (pass ==2) {par(cex=1.1, fin=c(5.9,5.3) )} # was cex=1.3, fin=c(7,6.3)
# Draw an invisible version of the graph to establish the ranges.
plot(PctTrue,PctFalsPos, type="n",
xlab=" ",
ylab=" ",
axes=FALSE)
# Add the curving line to the graph.
lines(PctTrue, PctFalsPos, type="l", col=linecol,
lty=linetype,lwd=linewid)
# Add the four axes.
xoff=.8
yoff=1
axis(1,pos=-yoff,col=AxisCol,lwd=AxisWid, padj=-0.5)
axis(2,pos=-xoff,las=1,col=AxisCol,lwd=AxisWid,
at=c(0,10,20,30,40,50,60,70,80,90,100), padj=0.5, hadj=.8,
labels=(c("0","10","20","30","40","50","60","70","80","90","100")))
axis(3,labels=FALSE,lwd.ticks=0,pos=100+yoff,col=AxisCol,lwd=AxisWid)
axis(4,labels=FALSE,lwd.ticks=0,pos=60+xoff,col=AxisCol,lwd=AxisWid)
# Draw 8 lines to close the four corners of the axes working clockwise
# from the lower left. These are necessary because the graph is designed
# so that the three lines extend almost to the very edge of the plot
# region, which is non-standard in the R graphics package.
# In hindsight, I think that the following eight statements could have
# been eliminated by using the the xaxs="i" and yaxs="i" plotting
# parameters, which are set with par().
segments(0,-yoff,-xoff,-yoff,col=AxisCol,lty=1,lend=1,lwd=AxisWid)
segments(-xoff,-yoff,-xoff,0,col=AxisCol,lty=1,lend=1,lwd=AxisWid)
segments(-xoff,100,-xoff,100+yoff,col=AxisCol,lty=1,lend=1,lwd=AxisWid)
segments(-xoff,100+yoff,0,100+yoff,col=AxisCol,lty=1,lend=1,lwd=AxisWid)
segments(60,100+yoff,60+xoff,100+yoff,col=AxisCol,lty=1,lend=1,lwd=AxisWid)
segments(60+xoff,100+yoff,60+xoff,100,col=AxisCol,lty=1,lend=1,lwd=AxisWid)
segments(60+xoff,-yoff,60+xoff,100+yoff,col=AxisCol,lty=1,lend=1,lwd=AxisWid)
segments(60+xoff,-yoff,60,-yoff,col=AxisCol,lty=1,lend=1,lwd=AxisWid)
# Manually add the minor tick marks on the horizontal axis.
#for (i in 0:5) {
# for (j in 1:9) {
# segments(j+10*i,0-yoff,j+10*i,-1.-yoff,col=AxisCol,lwd=TickWid)
# }
# }
# Do the same for the minor tick marks on the vertical axis, but not
# on the lower-resolution PNG graph.
if (pass != 2){
for (i in 0:9) {
for (j in 1:9) {
segments(-xoff,j+10*i,-xoff-0.5,j+10*i,col=AxisCol,lwd=TickWid)
}
}
}
# Add the axis labels for the two axes.
text(30,-19, labels="Percentage of studied research hypotheses")
text(30,-27, labels="that are true in the field")
text (-12.85,50, labels="Percentage of positive results in the",srt=90)
text ( -9.00,50, labels="field that are false-positive errors",srt=90)
# Add the two gray lines.
xvalue = 20 # The value for xvalue must be an integer between 0 and
# 60 for the following statement to execute properly.
yvalue = subset(datf, PctTrue==xvalue)$PctFalsPos
segments(xvalue, yvalue, xvalue, 0-yoff, col=g2col, lwd=g2thick, lty=1,lend=1)
segments(xvalue, yvalue, 0-xoff, yvalue, col=g2col, lwd=g2thick, lty=1,lend=1)
# This completes the drawing of the graph.
# Close the graphics device (file) if this is the first or second pass.
if (pass < 3) {dev.off()}
# Reset the parameter values to the original values.
par <- oldpar
# End of the loop to draw the graph three times:
}
# Print the row of the dataframe for the case when PctTrue is equal to
# the value of the variable xvalue, which is set a few lines above for
# use in a post.
subset(datf, PctTrue==xvalue)
subset(datf, PctTrue==60)