2

I am working with two data frames and trying to automate a workflow that I currently carry out by hand.

# Sample measurements: one row per ID, six numeric series A-F.
# The columns are passed straight to data.frame() rather than created as
# free-standing vectors first, so no global object called `F` exists; a global
# `F` masks R's built-in shorthand for FALSE (e.g. in `show.legend = F`).
df1 <- data.frame(
  ID = c("ID101", "ID102", "ID103", "ID104", "ID105",
         "ID106", "ID107", "ID108", "ID109", "ID110"),
  A = c(420, 440, 490, 413, 446, 466, 454, 433, 401, 414),
  B = c(230, 240, 295, 253, 266, 286, 254, 233, 201, 214),
  C = c(20, 40, 90, 13, 46, 66, 54, 33, 61, 14),
  D = c(120, 140, 190, 113, 146, 166, 154, 133, 101, 114),
  E = c(38, 34, 33, 56, 87, 31, 12, 44, 68, 91),
  F = c(938, 934, 973, 956, 987, 931, 962, 944, 918, 921),
  stringsAsFactors = FALSE
)

# Column pairs to regress: row i pairs Upstream[i] with Downstream[i].
# Kept as character (not factor) so the names can be used directly as
# column names in df1.
df2 <- data.frame(
  Upstream = c("A", "C", "E"),
  Downstream = c("B", "D", "F"),
  stringsAsFactors = FALSE
)

I am currently running a simple linear regression between the upstream and downstream data and plotting its residuals alongside it. The way I do it manually is:

# Regress A on B using the data frame built above. The original passed
# `data = df`, but no object named `df` is created in this script (the data
# frame is `df1`).
fit <- lm(A ~ B, data = df1)

# Build a plotmath label "y = a + b * x, R^2 = ..., RMSE = ..." for a simple
# linear regression.
#
# Args:
#   df:      data frame containing the variables named in `formula`.
#   formula: model formula to fit; defaults to A ~ B, so existing calls of the
#            form lm_eqn(df) behave as before.
#
# Returns:
#   A length-one character string holding a plotmath expression, intended for
#   geom_text(..., parse = TRUE).
lm_eqn <- function(df, formula = A ~ B) {
  m <- lm(formula, df)
  # unname(): format() keeps the names of coef(m), and a named vector is
  # deparsed as c(`(Intercept)` = "...") instead of a bare value, which would
  # end up in the label text.
  eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(R)^2~"="~r2* "," ~~ RMSE ~"="~rmse,
                   list(a = format(unname(coef(m)[1]), digits = 2),
                        b = format(unname(coef(m)[2]), digits = 2),
                        r2 = format(summary(m)$r.squared, digits = 3),
                        rmse = round(sqrt(mean(resid(m)^2, na.rm = TRUE)), 3)))
  as.character(as.expression(eq))
}

library(ggplot2)
library(grid)
library(gridExtra)

# Left panel: scatter of A against B with a fitted line and the regression
# label pinned to the top-left corner.
# Fixes over the original: the data frame is `df1` (no `df` is defined in this
# script), and `show.legend` is spelled FALSE -- `F` is an ordinary variable
# that the setup code above overwrites with a numeric vector.
p1 <- ggplot(df1, aes(x = A, y = B)) +
  geom_point(colour = "red", size = 3) +
  geom_smooth(method = lm) +
  geom_text(aes(size = 10), x = -Inf, hjust = -1, y = Inf, vjust = 1,
            label = lm_eqn(df1), parse = TRUE, show.legend = FALSE)

# Right panel: residuals of `fit` plotted against B.
p2 <- ggplot(df1, aes(x = B, y = resid(fit))) +
  ylab("Residuals") +
  geom_point(shape = 1, colour = "red", size = 3) +
  geom_smooth(method = "lm")

# Put both panels side by side under a common bold title.
grid.arrange(p1, p2, ncol = 2,
             top = textGrob("Regression data",
                            gp = gpar(cex = 1.5, fontface = "bold")))

I get this plot

enter image description here

I then redo this manually for the next row in df2, which is C & D, and manually change the parameters again for the row after that, which is E & F.

How can I use functions to automate this logic, so that I run the code only once and get the 3 plots, one for each pair: (A & B), (C & D), (E & F)?

Please let me know if I am not clear about what I want. Ideally, I am looking for a way to code this up so that I don't need to manually enter the values (A, B, C, D, E, F) in the respective places every time I run it. Please provide some directions on how to solve this.

2
  • Try to avoid using F as an object name, as R uses it as shorthand for FALSE. Commented Feb 1, 2016 at 23:16
  • Yes. Thanks for pointing that out. Commented Feb 2, 2016 at 14:09

2 Answers 2

2

You can use apply() on each row of df2, using as.formula() and aes_string():

# Packages are attached once, not once per row of df2.
library(ggplot2)
library(grid)
library(gridExtra)

# Fit one upstream/downstream pair and draw its two-panel figure.
#
# Args:
#   up, down: names of two columns of `data`, given as strings.
#   data:     data frame holding both columns (df1 by default).
#
# Draws the regression panel and the residual panel side by side on the
# current graphics device via grid.arrange().
plot_pair <- function(up, down, data = df1) {
  # Same model as the manual workflow: upstream regressed on downstream.
  # NOTE(review): the scatter panel below maps `up` to x and `down` to y, so
  # geom_smooth() draws down ~ up while the label reports the coefficients of
  # up ~ down -- confirm which direction is intended.
  fit <- lm(as.formula(paste(up, "~", down)), data = data)

  # plotmath label built from the model fitted above (no second lm() call).
  # unname(): format() keeps the names of coef(fit), and a named vector is
  # deparsed as c(`(Intercept)` = "...") instead of a bare value.
  eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(R)^2~"="~r2* "," ~~ RMSE ~"="~rmse,
                   list(a = format(unname(coef(fit)[1]), digits = 2),
                        b = format(unname(coef(fit)[2]), digits = 2),
                        r2 = format(summary(fit)$r.squared, digits = 3),
                        rmse = round(sqrt(mean(resid(fit)^2, na.rm = TRUE)), 3)))
  eq_label <- as.character(as.expression(eq))

  # Keep the residuals as a column so every aesthetic is a column name.
  plot_data <- data
  plot_data$fit_resid <- resid(fit)

  # aes_string() is kept for compatibility with the ggplot2 version this
  # script was written for; it is deprecated in ggplot2 >= 3.0.0, where
  # aes(.data[[up]], .data[[down]]) is the replacement.
  p1 <- ggplot(plot_data, aes_string(x = up, y = down)) +
    geom_point(colour = "red", size = 3) +
    geom_smooth(method = lm) +
    geom_text(aes(size = 10), x = -Inf, hjust = -1, y = Inf, vjust = 1,
              label = eq_label, parse = TRUE, show.legend = FALSE)
  p2 <- ggplot(plot_data, aes_string(x = down, y = "fit_resid")) +
    ylab("Residuals") +
    geom_point(shape = 1, colour = "red", size = 3) +
    geom_smooth(method = "lm")

  grid.arrange(p1, p2, ncol = 2,
               top = textGrob("Regression data",
                              gp = gpar(cex = 1.5, fontface = "bold")))
}

# One figure per row of df2. An index loop is used instead of apply(), which
# would first coerce the data frame to a character matrix; as.character()
# also covers the case where the columns of df2 are factors.
for (i in seq_len(nrow(df2))) {
  plot_pair(as.character(df2$Upstream[i]), as.character(df2$Downstream[i]))
}
Sign up to request clarification or add additional context in comments.

2 Comments

This is exactly what I was looking for. Thank you so much HubertL. I have one more small question. How can we save these images one after the other into a pdf file? Could you please help me with that?
To save the plots in a pdf, call pdf("plotList.pdf") just before the apply, and dev.off() right after.
1

Another answer using reshape2 to organize your data and plyr to run the regression and plots on the data subsets:

library(reshape2)

# Reshape df1 to long format twice -- once over the upstream columns, once over
# the downstream columns -- and bind the two results side by side, so that each
# row pairs an upstream value with the downstream value of the same ID.
upstream_long <- melt(df1,
                      id.vars = "ID",
                      measure.vars = df2$Upstream,
                      variable.name = "up",
                      value.name = "Independent.var")
downstream_long <- melt(df1,
                        id.vars = "ID",
                        measure.vars = df2$Downstream,
                        variable.name = "down",
                        value.name = "Dependent.var")
df3 <- cbind(upstream_long, downstream_long)
rm(upstream_long, downstream_long)

# df3 -- result of the cbind() above (the ID column appears twice, once from
# each melt):
#      ID up Independent.var    ID down Dependent.var
# 1  ID101  A             420 ID101    B           230
# 2  ID102  A             440 ID102    B           240
# 3  ID103  A             490 ID103    B           295
#      .    .              .     .     .            .
# 28 ID108  E              44 ID108    F           944
# 29 ID109  E              68 ID109    F           918
# 30 ID110  E              91 ID110    F           921


#Small edit to the labeling function:
# Build a plotmath label "y = a + b * x, R^2 = ..., RMSE = ..." for the
# regression of Dependent.var on Independent.var.
#
# Args:
#   df: data frame with numeric columns `Dependent.var` and `Independent.var`.
#
# Returns:
#   A length-one character string holding a plotmath expression, intended for
#   geom_text(..., parse = TRUE).
lm_eqn <- function(df) {
  m <- lm(Dependent.var ~ Independent.var, df)  # column names of the melted data
  # unname(): format() keeps the names of coef(m), and a named vector is
  # deparsed as c(`(Intercept)` = "...") instead of a bare value, which would
  # end up in the label text.
  eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(R)^2~"="~r2* "," ~~ RMSE ~"="~rmse,
                   list(a = format(unname(coef(m)[1]), digits = 2),
                        b = format(unname(coef(m)[2]), digits = 2),
                        r2 = format(summary(m)$r.squared, digits = 3),
                        rmse = round(sqrt(mean(resid(m)^2, na.rm = TRUE)), 3)))
  as.character(as.expression(eq))
}

# Put your plot code into a function.
# Draw the regression panel and the residual panel for one subset of the
# melted data.
#
# Args:
#   zz: data frame with columns `Independent.var` and `Dependent.var`.
#
# Returns:
#   The object returned by grid.arrange() for the two panels. Change the last
#   expression to return something else, e.g. a list of the fit and the plots.
plotter <- function(zz) {
  # Residuals are stored as a column so they can be mapped like any variable.
  subset_fit <- lm(Dependent.var ~ Independent.var, zz)
  zz$resid <- resid(subset_fit)

  equation_label <- lm_eqn(zz)
  title_grob <- textGrob("Regression data",
                         gp = gpar(cex = 1.5, fontface = "bold"))

  scatter_panel <- ggplot(zz, aes(x = Independent.var, y = Dependent.var)) +
    geom_point(colour = "red", size = 3) +
    geom_smooth(method = lm) +
    geom_text(aes(size = 10), x = -Inf, hjust = -2, y = Inf, vjust = 1,
              label = equation_label, parse = TRUE, show.legend = FALSE)

  residual_panel <- ggplot(zz, aes(x = Dependent.var, y = resid)) +
    ylab("Residuals") +
    geom_point(shape = 1, colour = "red", size = 3) +
    geom_smooth(method = "lm")

  combined <- grid.arrange(scatter_panel, residual_panel, ncol = 2,
                           top = title_grob)
  combined
}

library(plyr)

# Split df3 by the upstream variable ("up") and call plotter() on each piece.
# dlply = (d)ata frame in, (l)ist out, from (ply)r; the value plotter()
# returns for each subset is collected into that list.
dlply(df3, "up", plotter)

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.