El objetivo de los siguientes datos es analizar si el consumo de ciertos alimentos y otros factores (como edad, sexo, etc.) tienen relación con el nivel de beta-caroteno en plasma (plasma beta-carotene). El interés original de estos datos fue recabar información que pudiera servir para encontrar los factores asociados con niveles bajos de beta-caroteno en plasma, dado que estos podrían estar asociados con el riesgo de desarrollar algunos tipos de cáncer.
Variables:
# Read the tab-separated data file; it has no header row, so the 14
# variable names are attached by hand right after loading.
Datos <- read.table(
  file = "images/archivos/prdata.dat",
  header = FALSE,
  sep = "\t"
)
colnames(Datos) <- c(
  "AGE", "SEX", "SMOKSTAT", "QUETELET", "VITUSE", "CALORIES", "FAT",
  "FIBER", "ALCOHOL", "CHOLESTEROL", "BETADIET", "RETDIET",
  "BETAPLASMA", "RETPLASMA"
)

# First rows, then the storage type of every column.
head(Datos)
str(Datos)
## 'data.frame': 315 obs. of 14 variables:
## $ AGE : int 64 76 38 40 72 40 65 58 35 55 ...
## $ SEX : int 2 2 2 2 2 2 2 2 2 2 ...
## $ SMOKSTAT : int 2 1 2 2 1 2 1 1 1 2 ...
## $ QUETELET : num 21.5 23.9 20 25.1 21 ...
## $ VITUSE : int 1 1 2 3 1 3 2 1 3 3 ...
## $ CALORIES : num 1299 1032 2372 2450 1952 ...
## $ FAT : num 57 50.1 83.6 97.5 82.6 56 52 63.4 57.8 39.6 ...
## $ FIBER : num 6.3 15.8 19.1 26.5 16.2 9.6 28.7 10.9 20.3 15.5 ...
## $ ALCOHOL : num 0 0 14.1 0.5 0 1.3 0 0 0.6 0 ...
## $ CHOLESTEROL: num 170.3 75.8 257.9 332.6 170.8 ...
## $ BETADIET : int 1945 2653 6321 1061 2863 1729 5371 823 2895 3307 ...
## $ RETDIET : int 890 451 660 864 1209 1439 802 2571 944 493 ...
## $ BETAPLASMA : int 200 124 328 153 92 148 258 64 218 81 ...
## $ RETPLASMA : int 915 727 721 615 799 654 834 825 517 562 ...
Usaremos, por simplicidad, variables binarias.
# Recode SEX and VITUSE as 0/1 indicator factors:
#   SEXFem    is 1 when SEX is coded 2,
#   VITUSEYes is 1 when VITUSE is coded 1.
# NOTE(review): VITUSE takes the values 1 to 3, so VITUSEYes flags code 1
# only -- confirm against the codebook that this is the intended "yes" group.
Datos$SEXFem <- as.factor(as.numeric(Datos$SEX == 2))
Datos$VITUSEYes <- as.factor(as.numeric(Datos$VITUSE == 1))

# Working data set: the response first, then the candidate predictors,
# with the original VITUSE code kept as the last column.
DatosRed <- Datos[, c(
  "BETAPLASMA", "AGE", "SEXFem", "VITUSEYes", "CALORIES", "FAT",
  "FIBER", "ALCOHOL", "CHOLESTEROL", "BETADIET", "VITUSE"
)]
summary(DatosRed)
## BETAPLASMA AGE SEXFem VITUSEYes CALORIES FAT
## Min. : 0 Min. :19 0: 42 0:193 Min. : 445 Min. : 14
## 1st Qu.: 90 1st Qu.:39 1:273 1:122 1st Qu.:1338 1st Qu.: 54
## Median : 140 Median :48 Median :1667 Median : 73
## Mean : 190 Mean :50 Mean :1797 Mean : 77
## 3rd Qu.: 230 3rd Qu.:62 3rd Qu.:2100 3rd Qu.: 95
## Max. :1415 Max. :83 Max. :6662 Max. :236
## FIBER ALCOHOL CHOLESTEROL BETADIET VITUSE
## Min. : 3 Min. : 0 Min. : 38 Min. : 214 Min. :1.00
## 1st Qu.: 9 1st Qu.: 0 1st Qu.:155 1st Qu.:1116 1st Qu.:1.00
## Median :12 Median : 0 Median :206 Median :1802 Median :2.00
## Mean :13 Mean : 3 Mean :242 Mean :2186 Mean :1.97
## 3rd Qu.:16 3rd Qu.: 3 3rd Qu.:309 3rd Qu.:2836 3rd Qu.:3.00
## Max. :37 Max. :203 Max. :901 Max. :9642 Max. :3.00
library(GGally)

# Pairwise plots of the response against the two indicator factors
# (columns 1, 3 and 4 of DatosRed) ...
ggpairs(DatosRed[, c("BETAPLASMA", "SEXFem", "VITUSEYes")])
# ... and against the numeric predictors (columns 1, 2 and 5 to 10).
ggpairs(DatosRed[, c(
  "BETAPLASMA", "AGE", "CALORIES", "FAT", "FIBER",
  "ALCOHOL", "CHOLESTEROL", "BETADIET"
)])

# Print the rows with extreme alcohol or calorie values and the rows whose
# response is exactly zero.
DatosRed[DatosRed[["ALCOHOL"]] > 100, ]
DatosRed[DatosRed[["CALORIES"]] > 4000, ]
DatosRed[DatosRed[["BETAPLASMA"]] == 0, ]

# Drop rows 62 and 257 (by position) from the working data.
# NOTE(review): presumably these are rows flagged by the filters above --
# confirm which filter each one comes from.
DatosRed2 <- DatosRed[-c(62, 257), ]
library(GGally)

# Same pairwise plots as before, now on the data without the dropped rows.
ggpairs(DatosRed2[, c("BETAPLASMA", "SEXFem", "VITUSEYes")])
ggpairs(DatosRed2[, c(
  "BETAPLASMA", "AGE", "CALORIES", "FAT", "FIBER",
  "ALCOHOL", "CHOLESTEROL", "BETADIET"
)])

# Response on the log scale, shifted by 10 before taking logs.
# NOTE(review): the offset presumably guards against zero values -- confirm.
DatosRed2[["BETAPLASMAlog"]] <- log(DatosRed2[["BETAPLASMA"]] + 10)

# Full linear model with the nine candidate predictors; the formula is kept
# inline so the printed call shows every term.
fit2 <- lm(
  BETAPLASMAlog ~ AGE + SEXFem + VITUSEYes + CALORIES + FAT + FIBER +
    ALCOHOL + CHOLESTEROL + BETADIET,
  data = DatosRed2
)
summary(fit2)
##
## Call:
## lm(formula = BETAPLASMAlog ~ AGE + SEXFem + VITUSEYes + CALORIES +
## FAT + FIBER + ALCOHOL + CHOLESTEROL + BETADIET, data = DatosRed2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.8298 -0.3546 -0.0217 0.3707 2.0218
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.27e+00 2.56e-01 16.69 <2e-16 ***
## AGE 5.05e-03 2.80e-03 1.81 0.0721 .
## SEXFem1 2.80e-01 1.20e-01 2.33 0.0206 *
## VITUSEYes1 2.24e-01 7.61e-02 2.94 0.0035 **
## CALORIES -1.69e-04 1.92e-04 -0.88 0.3799
## FAT 4.61e-04 3.03e-03 0.15 0.8792
## FIBER 3.49e-02 1.06e-02 3.30 0.0011 **
## ALCOHOL 8.05e-03 8.21e-03 0.98 0.3277
## CHOLESTEROL -4.18e-04 4.19e-04 -1.00 0.3193
## BETADIET 4.19e-05 2.83e-05 1.48 0.1386
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.64 on 303 degrees of freedom
## Multiple R-squared: 0.164, Adjusted R-squared: 0.14
## F-statistic: 6.62 on 9 and 303 DF, p-value: 1.21e-08
# F test for deleting each term of the full model one at a time.
drop1(fit2, test = "F")

library(leaps)

# Best-subsets search over the same nine predictors, keeping the two best
# models of each size (nbest = 2). With the default size limit the listing
# stops at 8 predictors, so the full 9-predictor model is not shown.
subconjuntos <- regsubsets(
  BETAPLASMAlog ~ AGE + SEXFem + VITUSEYes + CALORIES + FAT + FIBER +
    ALCOHOL + CHOLESTEROL + BETADIET,
  data = DatosRed2,
  nbest = 2
)
subconjuntos2 <- summary(subconjuntos)

# One row per candidate model: the 0/1 inclusion indicators followed by
# four fit criteria, whose columns are labelled after binding.
ndim <- dim(subconjuntos2$which)
combine <- cbind(
  subconjuntos2$which,
  subconjuntos2$rsq,
  subconjuntos2$adjr2,
  subconjuntos2$cp,
  subconjuntos2$bic
)
colnames(combine)[ndim[2] + 1:4] <- c("R^2", "R^2_adj", "Cp", "BIC")
round(combine, digits = 4)
## (Intercept) AGE SEXFem1 VITUSEYes1 CALORIES FAT FIBER ALCOHOL CHOLESTEROL
## 1 1 0 0 0 0 0 1 0 0
## 1 1 0 0 1 0 0 0 0 0
## 2 1 0 0 0 1 0 1 0 0
## 2 1 0 0 0 0 0 1 0 1
## 3 1 0 0 1 1 0 1 0 0
## 3 1 0 0 1 0 0 1 0 1
## 4 1 0 1 1 1 0 1 0 0
## 4 1 0 0 1 1 0 1 0 0
## 5 1 1 1 1 1 0 1 0 0
## 5 1 1 1 1 0 1 1 0 0
## 6 1 1 1 1 1 0 1 0 0
## 6 1 1 1 1 0 0 1 0 1
## 7 1 1 1 1 1 0 1 0 1
## 7 1 1 1 1 1 0 1 1 0
## 8 1 1 1 1 1 0 1 1 1
## 8 1 1 1 1 0 1 1 1 1
## BETADIET R^2 R^2_adj Cp BIC
## 1 0 0.059 0.056 32.3 -7.4
## 1 0 0.042 0.039 38.3 -2.0
## 2 0 0.101 0.095 19.0 -16.0
## 2 0 0.097 0.091 20.3 -14.8
## 3 0 0.132 0.123 9.9 -21.1
## 3 0 0.126 0.118 11.7 -19.3
## 4 0 0.142 0.130 8.2 -19.1
## 4 1 0.139 0.128 9.2 -18.1
## 5 0 0.152 0.138 6.5 -17.1
## 5 0 0.151 0.137 6.9 -16.7
## 6 1 0.158 0.141 6.3 -13.6
## 6 1 0.158 0.141 6.4 -13.5
## 7 1 0.162 0.142 7.0 -9.2
## 7 1 0.162 0.142 7.0 -9.2
## 8 1 0.164 0.142 8.0 -4.5
## 8 1 0.162 0.140 8.8 -3.7
Backward AIC
# k is the penalty per parameter: 2 gives AIC, log(n) gives BIC.
# Backward elimination from the full model using the AIC penalty.
step(
  fit2,
  direction = "backward",
  k = 2
)
## Start: AIC=-272
## BETAPLASMAlog ~ AGE + SEXFem + VITUSEYes + CALORIES + FAT + FIBER +
## ALCOHOL + CHOLESTEROL + BETADIET
##
## Df Sum of Sq RSS AIC
## - FAT 1 0.01 123 -274
## - CALORIES 1 0.31 123 -273
## - ALCOHOL 1 0.39 123 -273
## - CHOLESTEROL 1 0.40 124 -273
## <none> 123 -272
## - BETADIET 1 0.90 124 -272
## - AGE 1 1.32 124 -271
## - SEXFem 1 2.20 125 -269
## - VITUSEYes 1 3.51 127 -265
## - FIBER 1 4.43 128 -263
##
## Step: AIC=-274
## BETAPLASMAlog ~ AGE + SEXFem + VITUSEYes + CALORIES + FIBER +
## ALCOHOL + CHOLESTEROL + BETADIET
##
## Df Sum of Sq RSS AIC
## - ALCOHOL 1 0.39 123 -275
## - CHOLESTEROL 1 0.39 124 -275
## <none> 123 -274
## - CALORIES 1 0.83 124 -274
## - BETADIET 1 0.90 124 -274
## - AGE 1 1.39 124 -273
## - SEXFem 1 2.19 125 -271
## - VITUSEYes 1 3.51 127 -267
## - FIBER 1 5.36 128 -263
##
## Step: AIC=-275
## BETAPLASMAlog ~ AGE + SEXFem + VITUSEYes + CALORIES + FIBER +
## CHOLESTEROL + BETADIET
##
## Df Sum of Sq RSS AIC
## - CHOLESTEROL 1 0.53 124 -276
## - CALORIES 1 0.59 124 -276
## <none> 123 -275
## - BETADIET 1 0.99 124 -275
## - AGE 1 1.43 125 -274
## - SEXFem 1 1.91 125 -272
## - VITUSEYes 1 3.28 127 -269
## - FIBER 1 5.01 128 -265
##
## Step: AIC=-276
## BETAPLASMAlog ~ AGE + SEXFem + VITUSEYes + CALORIES + FIBER +
## BETADIET
##
## Df Sum of Sq RSS AIC
## <none> 124 -276
## - BETADIET 1 0.88 125 -276
## - AGE 1 1.38 125 -274
## - SEXFem 1 2.33 126 -272
## - CALORIES 1 2.71 127 -271
## - VITUSEYes 1 3.43 127 -269
## - FIBER 1 6.50 130 -262
##
## Call:
## lm(formula = BETAPLASMAlog ~ AGE + SEXFem + VITUSEYes + CALORIES +
## FIBER + BETADIET, data = DatosRed2)
##
## Coefficients:
## (Intercept) AGE SEXFem1 VITUSEYes1 CALORIES FIBER
## 4.26e+00 5.09e-03 2.77e-01 2.18e-01 -1.90e-04 3.54e-02
## BETADIET
## 4.14e-05
# Repeat the backward AIC search and print the coefficient table of the
# model it selects (the elimination trace is printed again first).
summary(
  step(
    fit2,
    direction = "backward",
    k = 2
  )
)
## Start: AIC=-272
## BETAPLASMAlog ~ AGE + SEXFem + VITUSEYes + CALORIES + FAT + FIBER +
## ALCOHOL + CHOLESTEROL + BETADIET
##
## Df Sum of Sq RSS AIC
## - FAT 1 0.01 123 -274
## - CALORIES 1 0.31 123 -273
## - ALCOHOL 1 0.39 123 -273
## - CHOLESTEROL 1 0.40 124 -273
## <none> 123 -272
## - BETADIET 1 0.90 124 -272
## - AGE 1 1.32 124 -271
## - SEXFem 1 2.20 125 -269
## - VITUSEYes 1 3.51 127 -265
## - FIBER 1 4.43 128 -263
##
## Step: AIC=-274
## BETAPLASMAlog ~ AGE + SEXFem + VITUSEYes + CALORIES + FIBER +
## ALCOHOL + CHOLESTEROL + BETADIET
##
## Df Sum of Sq RSS AIC
## - ALCOHOL 1 0.39 123 -275
## - CHOLESTEROL 1 0.39 124 -275
## <none> 123 -274
## - CALORIES 1 0.83 124 -274
## - BETADIET 1 0.90 124 -274
## - AGE 1 1.39 124 -273
## - SEXFem 1 2.19 125 -271
## - VITUSEYes 1 3.51 127 -267
## - FIBER 1 5.36 128 -263
##
## Step: AIC=-275
## BETAPLASMAlog ~ AGE + SEXFem + VITUSEYes + CALORIES + FIBER +
## CHOLESTEROL + BETADIET
##
## Df Sum of Sq RSS AIC
## - CHOLESTEROL 1 0.53 124 -276
## - CALORIES 1 0.59 124 -276
## <none> 123 -275
## - BETADIET 1 0.99 124 -275
## - AGE 1 1.43 125 -274
## - SEXFem 1 1.91 125 -272
## - VITUSEYes 1 3.28 127 -269
## - FIBER 1 5.01 128 -265
##
## Step: AIC=-276
## BETAPLASMAlog ~ AGE + SEXFem + VITUSEYes + CALORIES + FIBER +
## BETADIET
##
## Df Sum of Sq RSS AIC
## <none> 124 -276
## - BETADIET 1 0.88 125 -276
## - AGE 1 1.38 125 -274
## - SEXFem 1 2.33 126 -272
## - CALORIES 1 2.71 127 -271
## - VITUSEYes 1 3.43 127 -269
## - FIBER 1 6.50 130 -262
##
## Call:
## lm(formula = BETAPLASMAlog ~ AGE + SEXFem + VITUSEYes + CALORIES +
## FIBER + BETADIET, data = DatosRed2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.8294 -0.3714 -0.0243 0.3616 2.1089
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.26e+00 2.51e-01 16.97 < 2e-16 ***
## AGE 5.09e-03 2.76e-03 1.84 0.0663 .
## SEXFem1 2.77e-01 1.16e-01 2.40 0.0171 *
## VITUSEYes1 2.18e-01 7.49e-02 2.91 0.0039 **
## CALORIES -1.90e-04 7.34e-05 -2.59 0.0101 *
## FIBER 3.54e-02 8.83e-03 4.01 7.8e-05 ***
## BETADIET 4.14e-05 2.81e-05 1.48 0.1411
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.64 on 306 degrees of freedom
## Multiple R-squared: 0.158, Adjusted R-squared: 0.142
## F-statistic: 9.57 on 6 and 306 DF, p-value: 1.19e-09
Backward BIC
# Number of rows in the modelling data; its logarithm is the BIC penalty.
nobs <- nrow(DatosRed2)
# k is the penalty per parameter: 2 gives AIC, log(n) gives BIC.
# Backward elimination from the full model using the BIC penalty; step()
# still labels the criterion column "AIC" in its trace.
step(
  fit2,
  direction = "backward",
  k = log(nobs)
)
## Start: AIC=-235
## BETAPLASMAlog ~ AGE + SEXFem + VITUSEYes + CALORIES + FAT + FIBER +
## ALCOHOL + CHOLESTEROL + BETADIET
##
## Df Sum of Sq RSS AIC
## - FAT 1 0.01 123 -240
## - CALORIES 1 0.31 123 -240
## - ALCOHOL 1 0.39 123 -240
## - CHOLESTEROL 1 0.40 124 -240
## - BETADIET 1 0.90 124 -238
## - AGE 1 1.32 124 -237
## - SEXFem 1 2.20 125 -235
## <none> 123 -235
## - VITUSEYes 1 3.51 127 -232
## - FIBER 1 4.43 128 -229
##
## Step: AIC=-240
## BETAPLASMAlog ~ AGE + SEXFem + VITUSEYes + CALORIES + FIBER +
## ALCOHOL + CHOLESTEROL + BETADIET
##
## Df Sum of Sq RSS AIC
## - ALCOHOL 1 0.39 123 -245
## - CHOLESTEROL 1 0.39 124 -245
## - CALORIES 1 0.83 124 -244
## - BETADIET 1 0.90 124 -244
## - AGE 1 1.39 124 -243
## - SEXFem 1 2.19 125 -241
## <none> 123 -240
## - VITUSEYes 1 3.51 127 -237
## - FIBER 1 5.36 128 -233
##
## Step: AIC=-245
## BETAPLASMAlog ~ AGE + SEXFem + VITUSEYes + CALORIES + FIBER +
## CHOLESTEROL + BETADIET
##
## Df Sum of Sq RSS AIC
## - CHOLESTEROL 1 0.53 124 -250
## - CALORIES 1 0.59 124 -250
## - BETADIET 1 0.99 124 -248
## - AGE 1 1.43 125 -247
## - SEXFem 1 1.91 125 -246
## <none> 123 -245
## - VITUSEYes 1 3.28 127 -243
## - FIBER 1 5.01 128 -238
##
## Step: AIC=-250
## BETAPLASMAlog ~ AGE + SEXFem + VITUSEYes + CALORIES + FIBER +
## BETADIET
##
## Df Sum of Sq RSS AIC
## - BETADIET 1 0.88 125 -253
## - AGE 1 1.38 125 -252
## <none> 124 -250
## - SEXFem 1 2.33 126 -250
## - CALORIES 1 2.71 127 -249
## - VITUSEYes 1 3.43 127 -247
## - FIBER 1 6.50 130 -239
##
## Step: AIC=-253
## BETAPLASMAlog ~ AGE + SEXFem + VITUSEYes + CALORIES + FIBER
##
## Df Sum of Sq RSS AIC
## - AGE 1 1.54 126 -255
## <none> 125 -253
## - SEXFem 1 2.48 127 -253
## - CALORIES 1 2.59 128 -252
## - VITUSEYes 1 3.67 128 -250
## - FIBER 1 10.16 135 -234
##
## Step: AIC=-255
## BETAPLASMAlog ~ SEXFem + VITUSEYes + CALORIES + FIBER
##
## Df Sum of Sq RSS AIC
## - SEXFem 1 1.49 128 -257
## <none> 126 -255
## - VITUSEYes 1 4.22 131 -251
## - CALORIES 1 4.69 131 -249
## - FIBER 1 12.39 139 -232
##
## Step: AIC=-257
## BETAPLASMAlog ~ VITUSEYes + CALORIES + FIBER
##
## Df Sum of Sq RSS AIC
## <none> 128 -257
## - VITUSEYes 1 4.51 132 -252
## - CALORIES 1 5.76 134 -249
## - FIBER 1 12.75 141 -233
##
## Call:
## lm(formula = BETAPLASMAlog ~ VITUSEYes + CALORIES + FIBER, data = DatosRed2)
##
## Coefficients:
## (Intercept) VITUSEYes1 CALORIES FIBER
## 4.838785 0.247820 -0.000256 0.044410
# Repeat the backward BIC search and print the coefficient table of the
# model it selects (the elimination trace is printed again first).
summary(
  step(
    fit2,
    direction = "backward",
    k = log(nobs)
  )
)
## Start: AIC=-235
## BETAPLASMAlog ~ AGE + SEXFem + VITUSEYes + CALORIES + FAT + FIBER +
## ALCOHOL + CHOLESTEROL + BETADIET
##
## Df Sum of Sq RSS AIC
## - FAT 1 0.01 123 -240
## - CALORIES 1 0.31 123 -240
## - ALCOHOL 1 0.39 123 -240
## - CHOLESTEROL 1 0.40 124 -240
## - BETADIET 1 0.90 124 -238
## - AGE 1 1.32 124 -237
## - SEXFem 1 2.20 125 -235
## <none> 123 -235
## - VITUSEYes 1 3.51 127 -232
## - FIBER 1 4.43 128 -229
##
## Step: AIC=-240
## BETAPLASMAlog ~ AGE + SEXFem + VITUSEYes + CALORIES + FIBER +
## ALCOHOL + CHOLESTEROL + BETADIET
##
## Df Sum of Sq RSS AIC
## - ALCOHOL 1 0.39 123 -245
## - CHOLESTEROL 1 0.39 124 -245
## - CALORIES 1 0.83 124 -244
## - BETADIET 1 0.90 124 -244
## - AGE 1 1.39 124 -243
## - SEXFem 1 2.19 125 -241
## <none> 123 -240
## - VITUSEYes 1 3.51 127 -237
## - FIBER 1 5.36 128 -233
##
## Step: AIC=-245
## BETAPLASMAlog ~ AGE + SEXFem + VITUSEYes + CALORIES + FIBER +
## CHOLESTEROL + BETADIET
##
## Df Sum of Sq RSS AIC
## - CHOLESTEROL 1 0.53 124 -250
## - CALORIES 1 0.59 124 -250
## - BETADIET 1 0.99 124 -248
## - AGE 1 1.43 125 -247
## - SEXFem 1 1.91 125 -246
## <none> 123 -245
## - VITUSEYes 1 3.28 127 -243
## - FIBER 1 5.01 128 -238
##
## Step: AIC=-250
## BETAPLASMAlog ~ AGE + SEXFem + VITUSEYes + CALORIES + FIBER +
## BETADIET
##
## Df Sum of Sq RSS AIC
## - BETADIET 1 0.88 125 -253
## - AGE 1 1.38 125 -252
## <none> 124 -250
## - SEXFem 1 2.33 126 -250
## - CALORIES 1 2.71 127 -249
## - VITUSEYes 1 3.43 127 -247
## - FIBER 1 6.50 130 -239
##
## Step: AIC=-253
## BETAPLASMAlog ~ AGE + SEXFem + VITUSEYes + CALORIES + FIBER
##
## Df Sum of Sq RSS AIC
## - AGE 1 1.54 126 -255
## <none> 125 -253
## - SEXFem 1 2.48 127 -253
## - CALORIES 1 2.59 128 -252
## - VITUSEYes 1 3.67 128 -250
## - FIBER 1 10.16 135 -234
##
## Step: AIC=-255
## BETAPLASMAlog ~ SEXFem + VITUSEYes + CALORIES + FIBER
##
## Df Sum of Sq RSS AIC
## - SEXFem 1 1.49 128 -257
## <none> 126 -255
## - VITUSEYes 1 4.22 131 -251
## - CALORIES 1 4.69 131 -249
## - FIBER 1 12.39 139 -232
##
## Step: AIC=-257
## BETAPLASMAlog ~ VITUSEYes + CALORIES + FIBER
##
## Df Sum of Sq RSS AIC
## <none> 128 -257
## - VITUSEYes 1 4.51 132 -252
## - CALORIES 1 5.76 134 -249
## - FIBER 1 12.75 141 -233
##
## Call:
## lm(formula = BETAPLASMAlog ~ VITUSEYes + CALORIES + FIBER, data = DatosRed2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.9067 -0.3894 -0.0399 0.3877 2.0871
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.84e+00 1.19e-01 40.50 < 2e-16 ***
## VITUSEYes1 2.48e-01 7.51e-02 3.30 0.00107 **
## CALORIES -2.56e-04 6.87e-05 -3.73 0.00023 ***
## FIBER 4.44e-02 8.00e-03 5.55 6.2e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.64 on 309 degrees of freedom
## Multiple R-squared: 0.131, Adjusted R-squared: 0.123
## F-statistic: 15.6 on 3 and 309 DF, p-value: 1.81e-09