#Homework 8, Stat 484
#6/09/22
#Mike Zanotti
################################################################
#Problem 1
#Find (or invent) some data (not from the "Data" directory supplied with EssentialR) and import it into R. (It is not a bad idea to include a commented line with units for each variable in your .txt or .csv file).
#Import the Problem 1 data as a data frame: comma-separated with a header row,
#quote handling switched off, text columns converted to factors, rows numbered
#automatically, and anything after a "#" in the file treated as a comment.
my.data1 <- read.csv(
  file = "~/Dropbox/R_Class/EssentialR/Data/BeansData.csv",
  header = TRUE,
  sep = ",",
  quote = "",
  dec = ".",
  row.names = NULL,
  stringsAsFactors = TRUE,
  comment.char = "#"
)
#List the column names taken from the header row
names(my.data1)
## [1] "pot.size" "phos" "P.lev" "rep" "trt" "rt.len" "ShtDM"
## [8] "RtDM"
#Number of rows and columns in the imported data frame
dim(my.data1)
## [1] 24 8
#Per-column summary: min/quartiles/mean/max for numeric columns, level counts for factors
summary(my.data1)
## pot.size phos P.lev rep trt rt.len
## Min. : 4 Min. : 70.0 H:12 A:6 a:4 Min. :146.2
## 1st Qu.: 4 1st Qu.:105.0 L:12 B:6 b:4 1st Qu.:243.3
## Median : 8 Median :175.0 C:6 c:4 Median :280.4
## Mean : 8 Mean :192.5 D:6 d:4 Mean :301.3
## 3rd Qu.:12 3rd Qu.:210.0 e:4 3rd Qu.:360.3
## Max. :12 Max. :420.0 f:4 Max. :521.7
## ShtDM RtDM
## Min. :0.5130 Min. :0.4712
## 1st Qu.:0.8065 1st Qu.:0.6439
## Median :1.0579 Median :0.7837
## Mean :1.1775 Mean :0.8669
## 3rd Qu.:1.3159 3rd Qu.:0.9789
## Max. :2.7627 Max. :1.7510
#Compact structure listing: the type of each column and its first few values
str(my.data1)
## 'data.frame': 24 obs. of 8 variables:
## $ pot.size: int 4 4 4 4 4 4 4 4 8 8 ...
## $ phos : int 210 210 210 210 420 420 420 420 105 105 ...
## $ P.lev : Factor w/ 2 levels "H","L": 2 2 2 2 1 1 1 1 2 2 ...
## $ rep : Factor w/ 4 levels "A","B","C","D": 1 2 3 4 1 2 3 4 1 2 ...
## $ trt : Factor w/ 6 levels "a","b","c","d",..: 1 1 1 1 2 2 2 2 3 3 ...
## $ rt.len : num 255 211 266 289 487 ...
## $ ShtDM : num 0.796 0.779 0.98 1.223 2.763 ...
## $ RtDM : num 0.648 0.758 0.7 0.989 1.751 ...
#Show the first rows of the data frame (six are printed below)
head(my.data1)
## pot.size phos P.lev rep trt rt.len ShtDM RtDM
## 1 4 210 L A a 255.29 0.7962 0.6483
## 2 4 210 L B a 211.42 0.7790 0.7582
## 3 4 210 L C a 265.91 0.9795 0.6995
## 4 4 210 L D a 288.55 1.2228 0.9890
## 5 4 420 H A b 486.60 2.7627 1.7510
## 6 4 420 H B b 286.06 1.0743 0.6536
#a) What did you have to do to "clean it up" so it would read in?
#Answer:
#When I copied and pasted the file path in, I initially received the following error:
#Error: '\U' used without hex digits in character string starting ""C:\U".
#To correct this I changed each backslash (\) in the path to a forward slash (/), since all the file paths in the text examples use (/); R reads a backslash inside a string as the start of an escape sequence. After that change the file read in.
#My data initially had a % sign on the values in the yield column, so it read in as a Factor w/ 8 levels instead of as a numeric ("num") column. I had to go into the Excel file and change the yield column's format from "percentage" to "number". Once I did that, yield read in as "num".
#b) Are you satisfied with the console output of summary(yourdata)? Did all the variables import in the way (format) you thought they should? c) Include the output of summary(yourdata) and head(yourdata).
#Yes, once I cleaned up the excel file, I was able to successfully read in the data in the format I was hoping for.
#summary() output repeated here because part (c) asks for it to be included
summary(my.data1)
## pot.size phos P.lev rep trt rt.len
## Min. : 4 Min. : 70.0 H:12 A:6 a:4 Min. :146.2
## 1st Qu.: 4 1st Qu.:105.0 L:12 B:6 b:4 1st Qu.:243.3
## Median : 8 Median :175.0 C:6 c:4 Median :280.4
## Mean : 8 Mean :192.5 D:6 d:4 Mean :301.3
## 3rd Qu.:12 3rd Qu.:210.0 e:4 3rd Qu.:360.3
## Max. :12 Max. :420.0 f:4 Max. :521.7
## ShtDM RtDM
## Min. :0.5130 Min. :0.4712
## 1st Qu.:0.8065 1st Qu.:0.6439
## Median :1.0579 Median :0.7837
## Mean :1.1775 Mean :0.8669
## 3rd Qu.:1.3159 3rd Qu.:0.9789
## Max. :2.7627 Max. :1.7510
#Answer:
# ?..run. tank yield.... date
#Min. : 1 Min. :261.0 Min. :0.1000 1/1/2022 : 1
#1st Qu.: 6 1st Qu.:265.0 1st Qu.:0.1100 1/15/2022 : 1
#Median :11 Median :268.0 Median :0.1510 1/30/2022 : 1
#Mean :11 Mean :266.9 Mean :0.1482 10/12/2022: 1
#3rd Qu.:16 3rd Qu.:269.0 3rd Qu.:0.1820 10/27/2022: 1
#Max. :21 Max. :270.0 Max. :0.1920 2/14/2022 : 1
#(Other) :15
#head() output repeated here because part (c) asks for it to be included
head(my.data1)
## pot.size phos P.lev rep trt rt.len ShtDM RtDM
## 1 4 210 L A a 255.29 0.7962 0.6483
## 2 4 210 L B a 211.42 0.7790 0.7582
## 3 4 210 L C a 265.91 0.9795 0.6995
## 4 4 210 L D a 288.55 1.2228 0.9890
## 5 4 420 H A b 486.60 2.7627 1.7510
## 6 4 420 H B b 286.06 1.0743 0.6536
#Answer:
# ?..run. tank yield.... date
#1 1 261 0.192 1/1/2022
#2 2 262 0.110 1/15/2022
#3 3 261 0.108 1/30/2022
#4 4 265 0.101 2/14/2022
#5 5 268 0.173 3/1/2022
#6 6 269 0.109 3/16/2022
################################################################
#Problem 2
#The spreadsheet "StatesData.xls" located in the Data directory in your EssentialR folder contains some (old) data about the 50 US states, and includes a plot with a regression line. Clean this data up and import it into R.
#Import the states data as a data frame: comma-separated with a header row,
#quote handling switched off, text columns converted to factors, and anything
#after a "#" in the file treated as a comment.
my.data2 <- read.csv(
  file = "~/Dropbox/R_Class/EssentialR/Data/states.csv",
  header = TRUE,
  sep = ",",
  quote = "",
  dec = ".",
  stringsAsFactors = TRUE,
  comment.char = "#"
)
#List the column names taken from the header row
names(my.data2)
## [1] "State" "Population" "Income" "Illiteracy" "Life.Exp"
## [6] "Murder" "HS.Grad" "Frost" "Area"
#Number of rows and columns in the imported data frame
dim(my.data2)
## [1] 50 9
#Per-column summary: min/quartiles/mean/max for numeric columns, level counts for the State factor
summary(my.data2)
## State Population Income Illiteracy Life.Exp
## Alabama : 1 Min. : 365 Min. :3098 Min. :0.500 Min. :67.96
## Alaska : 1 1st Qu.: 1080 1st Qu.:3993 1st Qu.:0.625 1st Qu.:70.12
## Arizona : 1 Median : 2838 Median :4519 Median :0.950 Median :70.67
## Arkansas : 1 Mean : 4246 Mean :4436 Mean :1.170 Mean :70.88
## California: 1 3rd Qu.: 4968 3rd Qu.:4814 3rd Qu.:1.575 3rd Qu.:71.89
## Colorado : 1 Max. :21198 Max. :6315 Max. :2.800 Max. :73.60
## (Other) :44
## Murder HS.Grad Frost Area
## Min. : 1.400 Min. :37.80 Min. : 0.00 Min. : 1049
## 1st Qu.: 4.350 1st Qu.:48.05 1st Qu.: 66.25 1st Qu.: 36985
## Median : 6.850 Median :53.25 Median :114.50 Median : 54277
## Mean : 7.378 Mean :53.11 Mean :104.46 Mean : 70736
## 3rd Qu.:10.675 3rd Qu.:59.15 3rd Qu.:139.75 3rd Qu.: 81162
## Max. :15.100 Max. :67.30 Max. :188.00 Max. :566432
##
#Compact structure listing: the type of each column and its first few values
str(my.data2)
## 'data.frame': 50 obs. of 9 variables:
## $ State : Factor w/ 50 levels "Alabama","Alaska",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Population: int 3615 365 2212 2110 21198 2541 3100 579 8277 4931 ...
## $ Income : int 3624 6315 4530 3378 5114 4884 5348 4809 4815 4091 ...
## $ Illiteracy: num 2.1 1.5 1.8 1.9 1.1 0.7 1.1 0.9 1.3 2 ...
## $ Life.Exp : num 69 69.3 70.5 70.7 71.7 ...
## $ Murder : num 15.1 11.3 7.8 10.1 10.3 6.8 3.1 6.2 10.7 13.9 ...
## $ HS.Grad : num 41.3 66.7 58.1 39.9 62.6 63.9 56 54.6 52.6 40.6 ...
## $ Frost : int 20 152 15 65 20 166 139 103 11 60 ...
## $ Area : int 50708 566432 113417 51945 156361 103766 4862 1982 54090 58073 ...
#This is the file path using the current working directory
#"C:/Users/Michael/OneDrive/Dropbox/Penn State University/Courses/Stat 484 R/EssentialR/Data"
#You should be able to fit a regression that mimics the plot in the spreadsheet.
#What is the p-value for the slope in this regression?
#Simple linear regression of Illiteracy on Income for the states data.
#The columns are named in the formula and looked up through `data =` instead of
#being written as my.data2$..., so the slope is labelled "Income" and the fitted
#model can be reused with predict() on new data. The estimates are unchanged.
model <- lm(Illiteracy ~ Income, data = my.data2)
#The "Income" row of the coefficient table holds the slope estimate and its p-value
summary(model)
##
## Call:
## lm(formula = Illiteracy ~ Income, data = my.data2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.79927 -0.46481 -0.09793 0.34011 1.24378
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.0932014 0.5765787 5.365 2.3e-06 ***
## Income -0.0004336 0.0001288 -3.367 0.00151 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5539 on 48 degrees of freedom
## Multiple R-squared: 0.191, Adjusted R-squared: 0.1742
## F-statistic: 11.34 on 1 and 48 DF, p-value: 0.001505
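#Answer: the p-value for the slope is 0.00151 (the Pr(>|t|) value in the slope row of the coefficient table). With a single predictor this is the same test as the overall F-test, whose p-value is 0.001505.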