#Homework 8, Stat 484
#6/09/22
#Mike Zanotti

################################################################
#Problem 1

#Find (or invent) some data (not from the "Data" directory supplied with EssentialR) and import it into R. (It is not a bad idea to include a commented line with units for each variable in your .txt or .csv file).

# Import the Problem 1 data set as a data frame.
#   quote = ""          : quote characters in the file are NOT treated specially
#   comment.char = "#"  : lines/trailing text starting with "#" are skipped
#   stringsAsFactors    : text columns are converted to factors
# NOTE(review): this path points into the EssentialR "Data" directory, while the
# written answers further down describe a different data set (run / tank / yield
# / date) -- confirm which file was meant to be read here.
my.data1 <- read.csv(
  "~/Dropbox/R_Class/EssentialR/Data/BeansData.csv",
  header = TRUE,
  sep = ",",
  quote = "",
  dec = ".",
  row.names = NULL,
  stringsAsFactors = TRUE,
  comment.char = "#"
)

# Column names of the imported data frame.
names(my.data1)
## [1] "pot.size" "phos"     "P.lev"    "rep"      "trt"      "rt.len"   "ShtDM"   
## [8] "RtDM"

# Number of rows and columns.
dim(my.data1)
## [1] 24  8

# Per-column summary (quartiles for numeric columns, counts for factors).
summary(my.data1)
##     pot.size       phos       P.lev  rep   trt       rt.len     
##  Min.   : 4   Min.   : 70.0   H:12   A:6   a:4   Min.   :146.2  
##  1st Qu.: 4   1st Qu.:105.0   L:12   B:6   b:4   1st Qu.:243.3  
##  Median : 8   Median :175.0          C:6   c:4   Median :280.4  
##  Mean   : 8   Mean   :192.5          D:6   d:4   Mean   :301.3  
##  3rd Qu.:12   3rd Qu.:210.0                e:4   3rd Qu.:360.3  
##  Max.   :12   Max.   :420.0                f:4   Max.   :521.7  
##      ShtDM             RtDM       
##  Min.   :0.5130   Min.   :0.4712  
##  1st Qu.:0.8065   1st Qu.:0.6439  
##  Median :1.0579   Median :0.7837  
##  Mean   :1.1775   Mean   :0.8669  
##  3rd Qu.:1.3159   3rd Qu.:0.9789  
##  Max.   :2.7627   Max.   :1.7510

# Storage type of every column (int / num / Factor).
str(my.data1)
## 'data.frame':    24 obs. of  8 variables:
##  $ pot.size: int  4 4 4 4 4 4 4 4 8 8 ...
##  $ phos    : int  210 210 210 210 420 420 420 420 105 105 ...
##  $ P.lev   : Factor w/ 2 levels "H","L": 2 2 2 2 1 1 1 1 2 2 ...
##  $ rep     : Factor w/ 4 levels "A","B","C","D": 1 2 3 4 1 2 3 4 1 2 ...
##  $ trt     : Factor w/ 6 levels "a","b","c","d",..: 1 1 1 1 2 2 2 2 3 3 ...
##  $ rt.len  : num  255 211 266 289 487 ...
##  $ ShtDM   : num  0.796 0.779 0.98 1.223 2.763 ...
##  $ RtDM    : num  0.648 0.758 0.7 0.989 1.751 ...

# First six rows, as a quick sanity check of the import.
head(my.data1)
##   pot.size phos P.lev rep trt rt.len  ShtDM   RtDM
## 1        4  210     L   A   a 255.29 0.7962 0.6483
## 2        4  210     L   B   a 211.42 0.7790 0.7582
## 3        4  210     L   C   a 265.91 0.9795 0.6995
## 4        4  210     L   D   a 288.55 1.2228 0.9890
## 5        4  420     H   A   b 486.60 2.7627 1.7510
## 6        4  420     H   B   b 286.06 1.0743 0.6536
#a) What did you have to do to "clean it up" so it would read in? 
#Answer:
#When I copied and pasted the file path in, I initially received the following error:
#Error: '\U' used without hex digits in character string starting ""C:\U".
#To correct this I changed each backslash (\) in the path to a forward slash (/), as the file paths in the text examples all use (/); after that change the file read in.
#My data initially had the % sign in the yield column, so it read in as a Factor w/ 8 levels instead of as a numeric ("num") column. I had to go into the Excel file and change the yield column format from "percentage" to "number". Once I did that, yield read in as "num".

#b) Are you satisfied with the console output of summary(yourdata)? Did all the variables import in the way (format) you thought they should? c) Include the output of summary(yourdata) and head(yourdata).
#Yes, once I cleaned up the excel file, I was able to successfully read in the data in the format I was hoping for. 

# Part (c): per-column summary of my.data1; the console output is pasted below.
summary(my.data1)
##     pot.size       phos       P.lev  rep   trt       rt.len     
##  Min.   : 4   Min.   : 70.0   H:12   A:6   a:4   Min.   :146.2  
##  1st Qu.: 4   1st Qu.:105.0   L:12   B:6   b:4   1st Qu.:243.3  
##  Median : 8   Median :175.0          C:6   c:4   Median :280.4  
##  Mean   : 8   Mean   :192.5          D:6   d:4   Mean   :301.3  
##  3rd Qu.:12   3rd Qu.:210.0                e:4   3rd Qu.:360.3  
##  Max.   :12   Max.   :420.0                f:4   Max.   :521.7  
##      ShtDM             RtDM       
##  Min.   :0.5130   Min.   :0.4712  
##  1st Qu.:0.8065   1st Qu.:0.6439  
##  Median :1.0579   Median :0.7837  
##  Mean   :1.1775   Mean   :0.8669  
##  3rd Qu.:1.3159   3rd Qu.:0.9789  
##  Max.   :2.7627   Max.   :1.7510
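#Answer (summary() output for my own data set; NOTE: the "?..run." column name looks like the "run" header with a byte-order mark prepended by the Excel CSV export -- if so, reading with fileEncoding = "UTF-8-BOM" should give a clean "run" name):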
#    ?..run.        tank         yield....              date   
#Min.   : 1   Min.   :261.0   Min.   :0.1000   1/1/2022  : 1  
#1st Qu.: 6   1st Qu.:265.0   1st Qu.:0.1100   1/15/2022 : 1  
#Median :11   Median :268.0   Median :0.1510   1/30/2022 : 1  
#Mean   :11   Mean   :266.9   Mean   :0.1482   10/12/2022: 1  
#3rd Qu.:16   3rd Qu.:269.0   3rd Qu.:0.1820   10/27/2022: 1  
#Max.   :21   Max.   :270.0   Max.   :0.1920   2/14/2022 : 1  
#(Other)   :15  

# Part (c): first six rows of my.data1; the console output is pasted below.
head(my.data1)
##   pot.size phos P.lev rep trt rt.len  ShtDM   RtDM
## 1        4  210     L   A   a 255.29 0.7962 0.6483
## 2        4  210     L   B   a 211.42 0.7790 0.7582
## 3        4  210     L   C   a 265.91 0.9795 0.6995
## 4        4  210     L   D   a 288.55 1.2228 0.9890
## 5        4  420     H   A   b 486.60 2.7627 1.7510
## 6        4  420     H   B   b 286.06 1.0743 0.6536
#Answer:
#  ?..run. tank yield....      date
#1       1  261     0.192  1/1/2022
#2       2  262     0.110 1/15/2022
#3       3  261     0.108 1/30/2022
#4       4  265     0.101 2/14/2022
#5       5  268     0.173  3/1/2022
#6       6  269     0.109 3/16/2022



################################################################
#Problem 2

#The spreadsheet "StatesData.xls" located in the Data directory in your EssentialR folder contains some (old) data about the 50 US states, and includes a plot with a regression line. Clean this data up and import it into R. 

# Import the cleaned-up states data (exported from the spreadsheet as CSV).
#   quote = ""          : quote characters in the file are NOT treated specially
#   comment.char = "#"  : lines/trailing text starting with "#" are skipped
#   stringsAsFactors    : the State column is read as a factor
my.data2 <- read.csv(
  "~/Dropbox/R_Class/EssentialR/Data/states.csv",
  header = TRUE,
  sep = ",",
  quote = "",
  dec = ".",
  stringsAsFactors = TRUE,
  comment.char = "#"
)

# Column names of the imported data frame.
names(my.data2)
## [1] "State"      "Population" "Income"     "Illiteracy" "Life.Exp"  
## [6] "Murder"     "HS.Grad"    "Frost"      "Area"

# Number of rows and columns (one row per state).
dim(my.data2)
## [1] 50  9

# Per-column summary (quartiles for numeric columns, counts for factors).
summary(my.data2)
##         State      Population        Income       Illiteracy       Life.Exp    
##  Alabama   : 1   Min.   :  365   Min.   :3098   Min.   :0.500   Min.   :67.96  
##  Alaska    : 1   1st Qu.: 1080   1st Qu.:3993   1st Qu.:0.625   1st Qu.:70.12  
##  Arizona   : 1   Median : 2838   Median :4519   Median :0.950   Median :70.67  
##  Arkansas  : 1   Mean   : 4246   Mean   :4436   Mean   :1.170   Mean   :70.88  
##  California: 1   3rd Qu.: 4968   3rd Qu.:4814   3rd Qu.:1.575   3rd Qu.:71.89  
##  Colorado  : 1   Max.   :21198   Max.   :6315   Max.   :2.800   Max.   :73.60  
##  (Other)   :44                                                                 
##      Murder          HS.Grad          Frost             Area       
##  Min.   : 1.400   Min.   :37.80   Min.   :  0.00   Min.   :  1049  
##  1st Qu.: 4.350   1st Qu.:48.05   1st Qu.: 66.25   1st Qu.: 36985  
##  Median : 6.850   Median :53.25   Median :114.50   Median : 54277  
##  Mean   : 7.378   Mean   :53.11   Mean   :104.46   Mean   : 70736  
##  3rd Qu.:10.675   3rd Qu.:59.15   3rd Qu.:139.75   3rd Qu.: 81162  
##  Max.   :15.100   Max.   :67.30   Max.   :188.00   Max.   :566432  
## 

# Storage type of every column (int / num / Factor).
str(my.data2)
## 'data.frame':    50 obs. of  9 variables:
##  $ State     : Factor w/ 50 levels "Alabama","Alaska",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ Population: int  3615 365 2212 2110 21198 2541 3100 579 8277 4931 ...
##  $ Income    : int  3624 6315 4530 3378 5114 4884 5348 4809 4815 4091 ...
##  $ Illiteracy: num  2.1 1.5 1.8 1.9 1.1 0.7 1.1 0.9 1.3 2 ...
##  $ Life.Exp  : num  69 69.3 70.5 70.7 71.7 ...
##  $ Murder    : num  15.1 11.3 7.8 10.1 10.3 6.8 3.1 6.2 10.7 13.9 ...
##  $ HS.Grad   : num  41.3 66.7 58.1 39.9 62.6 63.9 56 54.6 52.6 40.6 ...
##  $ Frost     : int  20 152 15 65 20 166 139 103 11 60 ...
##  $ Area      : int  50708 566432 113417 51945 156361 103766 4862 1982 54090 58073 ...
#This is the file path using the current working directory
#"C:/Users/Michael/OneDrive/Dropbox/Penn State University/Courses/Stat 484 R/EssentialR/Data"

#You should be able to fit a regression that mimics the plot in the spreadsheet. 
#What is the p-value for the slope in this regression?
# Simple linear regression of Illiteracy on Income for the 50 states.
# The formula uses bare column names together with `data = my.data2` rather
# than `my.data2$...` terms: the coefficient is then labelled "Income", and the
# fitted model can be used with predict(model, newdata = ...), which does not
# work when the columns are hard-wired into the formula with `$`.
model <- lm(Illiteracy ~ Income, data = my.data2)

# Coefficient table, residual summary and overall F-test for the fit.
summary(model)
## 
## Call:
## lm(formula = Illiteracy ~ Income, data = my.data2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.79927 -0.46481 -0.09793  0.34011  1.24378 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.0932014  0.5765787   5.365  2.3e-06 ***
## Income      -0.0004336  0.0001288  -3.367  0.00151 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5539 on 48 degrees of freedom
## Multiple R-squared:  0.191,  Adjusted R-squared:  0.1742 
## F-statistic: 11.34 on 1 and 48 DF,  p-value: 0.001505
#Answer: the p-value for the slope (Income) is 0.00151 in the coefficient table; with a single predictor this is the same test as the overall F-test, which reports it to more digits as 0.001505.