# CTRL + SHIFT + R : keyboard shortcut to create sections like we did here for each TP
# CTRL + enter : execute the current or selected line(s)
# CTRL + SHIFT + A : reformat the selected line(s)
# CTRL + SHIFT + C : comment the selected line(s)



# The questions for this TP are indicated with #. 
# The lines starting with # are not considered as code and will be skipped when running.
# You can write your code between the commented sections/questions. 
# If needed, don't hesitate to add your own comments to make it clearer for you !



# TP1 ---------------------------------------------------------------------

#' 1. Explore Rstudio interface (the 4 main panes, how to make them visible, terminal, 
#'          the tabs: Files / Plots / Help...)
#' Optional. You can open `Global options` in `Tools` (menu bar at the top of the RStudio interface)
#'            to change the size of the font in the `Appearance` panel.
#' 2. Create your first R script
#' 3. Write your first comment (write the title "## Formation R 2025 ##") 
#' 4. Write  2 + 2 in the console and press Enter to have the result
#' 5. Write  2 + 2 in the script and run it with Ctrl + Enter



# TP2 ---------------------------------------------------------------------

#' 1. Create 4 variables w, x, y, z : 
#'    - w a vector with 6.52, 9.3, 13, and 9.45
#'    - x a fraction (for example 2/3)
#'    - y a real number with at least 3 digits after the decimal point
#'    - z a vector with 8, 21.148, 32.5 and 27
# Two numeric vectors (w, z) and two single numbers (x, y)
w <- c(6.52, 9.3, 13, 9.45)
x <- 2 / 3
y <- 42.420
z <- c(8, 21.148, 32.5, 27)


#' 2. Give the result of x + y
x + y

#'    Give the result of x + y and print it with 2 decimals (using the function round)
#'    The second argument of round() is the number of decimals to keep
round(x + y, 2)


#' 3. Multiply the vector w by 2, and assign this result to an object named v
#'    The multiplication is applied to every element of w
v <- w * 2


#' 4. Print the second element of vector z
z[2]
#' is equivalent to
print(z[2])


#' 5. Give the result of v + z
#'    Both vectors have 4 elements, so they are added element by element
v + z


#' 6. Create a complete dataframe with 2 columns : `Name` and `First_name`
# Each argument of data.frame() is one column; all columns have 4 values
data.frame(
  Name = c("Chatelain", "Marsac", "Plantec", "Poupelin"),
  First_name = c("Estelle", "Benjamin", "Oriane", "Clément")
)

#' BEST PRACTICE : select lines of code and press ctrl + shift + A
#' to reformat them and make them easier to read


#' 7. Create a matrix called mat of size 3*3.
#'    The matrix can have any number elsewhere but it must have 1 on the diagonal.
#'    HINT : you have to use the `matrix()` function,
#'    make sure to check the help of this new function by typing `?matrix` or `help(matrix)`
#'    Multiply this matrix by 6
#'
#'    matrix() fills the values column by column: 9 values with nrow = 3
#'    give 3 columns, and the 1st, 5th and 9th values land on the diagonal
mat <- matrix(data = c(1, 2, 3, 4, 1, 5, 6, 7, 1), nrow = 3)
mat * 6


#' 8. Create a very simple plot (`plot()` function) : 
#'    with 2 vectors `x_vector` containing values from 1 to 4, and `y_vector` containing any value you like.
#'    HINT: make sure `x_vector` and `y_vector` have the same number of elements !
# x_vector holds the x-coordinates 1, 2, 3, 4 (name taken from the question)
x_vector <- 1:4
#' is equivalent to
x_vector <- seq(1, 4)

# y_vector reuses the 4 values of w (created in TP2, question 1),
# so both vectors have the same number of elements
y_vector <- w

plot(x_vector, y_vector)


#' 9.  Connect the points on the plot you just created
#'     HINT : using tab with the cursor in a function will give you possible parameters for the function
#'     type = "l" draws lines between the points instead of the points themselves
plot(x_vector, y_vector, type = "l")



# TP3 ---------------------------------------------------------------------

#' 1.0 With the help of `getwd()`, locate your working directory.
#'    Then, if necessary, you can set it to the R training folder using `setwd()`.

# Show the current working directory: the relative paths used in this script
# (such as "./Data/crops_data.csv") are resolved from it
print(getwd())


#' 1.1 Click on `Import Dataset` in the Environment panel to import the dataset crops_data.csv
#'     This csv dataset is located in the `Data` folder
#'     The line below imports the same file with code, so that `crops_data`
#'     also exists when this script is run from top to bottom without clicking

crops_data <- read.csv("./Data/crops_data.csv")


#' 1.2 Click on the `crops_data` object in the Environment panel, to view this dataset
#'     This is equivalent to using the View() function like this `View(crops_data)`

View(crops_data)


#' 2.1 Now, you can explore this `crops_data` object with the functions `head()`, `dim()`, `summary()` and `anyNA()`
#'    Try to understand what these functions do !

head(crops_data)     # first rows of the dataset
dim(crops_data)      # number of rows, then number of columns
summary(crops_data)  # summary of each column
anyNA(crops_data)    # TRUE if at least one value is missing

colSums(is.na(crops_data)) # you can see the number of NA per column


#' 2.2 As you can see, some values are missing in the dataset, they are indicated with a `NA`. 
#'    Use `na.omit()` to create a new object `crops_data_cleared` from `crops_data`, without missing values.
#'    Explore this new dataset to check there are no more NAs. 
#'    Some functions/methods cannot work properly when there are NAs in the data used; knowing how to deal with NAs is very important.

# na.omit() removes every row that contains at least one NA
crops_data_cleared <- na.omit(crops_data)
head(crops_data_cleared)
dim(crops_data_cleared)
summary(crops_data_cleared)


#' 2.3 Display the column called region
#'     GOOD TO KNOW : after typing $, press the tab key to auto-complete the column name !
crops_data_cleared$region
# is equivalent to
crops_data_cleared[, "region"]


#' 2.4 Display the columns 3 and 5 of `crops_data_cleared`
#'     Inside [ , ], rows go before the comma and columns after it
crops_data_cleared[, c(3, 5)]


#' 2.5 Display lines 18 to 37 of `crops_data_cleared`
#'     Then display lines 18 to 37 of the columns 1 and 2 of `crops_data_cleared`
crops_data_cleared[18:37, ]
crops_data_cleared[18:37, 1:2]


#' 2.6 Create a new object `hygro_data` as a subset of `crops_data_cleared` :
#'    use only columns region, irrigation_type, humidity, rainfall_mm and soil_moisture
#'    and only the lines where the region is East Africa (use the function subset())
#'    HINT : there are many ways to do this, you can create an intermediate object `temp_hygro`

# Step 1: keep only the five requested columns in an intermediate object
temp_hygro <- crops_data_cleared[, c(
  "region", "irrigation_type", "humidity", "rainfall_mm", "soil_moisture"
)]
# Step 2: keep only the rows where the region is East Africa
hygro_data <- subset(temp_hygro, region == "East Africa")


#' 2.7 Removing an object can sometimes be useful in R. You can use the rm() function
#'     Always be careful when removing things !
#'     Here the intermediate object is no longer needed
rm(temp_hygro)


#' 3.1 Import crops_ugly_data.csv. Can you identify bad things to correct in this dataframe ?

crops_ugly_data <- read.csv("./Data/crops_ugly_data.csv")
View(crops_ugly_data)


#' COLUMNS
#' There is an empty column that can be deleted
#' (assigning NULL to a column removes it from the dataframe)
crops_ugly_data$X.1 <- NULL
#' Special characters in the column names of the csv (like spaces or parentheses)
#' become `.` in the imported dataframe
#' A column name starting with a number (3harvest_date) gets an X added in front when importing
#' Some columns didn't have names and are now named X or X.1
#' We can replace the column names with more appropriate ones :
colnames(crops_ugly_data) <- c(
  "farm_id", "crop_type", "temperature", "pesticide_usage_ml", "harvest_date"
)


#' VALUES
summary(crops_ugly_data)
#' The summary shows that the type of the temperature column is character when it should be numeric
#' For the harvest_date, the type is also character and not date
#' This is due to values that break the unique type
#' (like 18,54 with a comma when all other decimal values are with a point)
#' (like 22  april 2025 and 03/08/24)


#' You could correct these problems in your initial csv or in R.
#' For example replace the decimal comma by a point in the temperature column
#' (18,54 becomes 18.54). Doing it on the whole column, instead of on one
#' hard-coded cell, also fixes any other value written with a comma.
#' Then make sure this column is considered as numeric (as.numeric()).
crops_ugly_data$temperature <- sub(",", ".", crops_ugly_data$temperature, fixed = TRUE)
crops_ugly_data$temperature <- as.numeric(crops_ugly_data$temperature)


#' 4.1 Save the objects crops_data and crops_ugly_data to work on them again another time
#'     You can load them again with load()
# Save several objects at once in a single .RData file
save(crops_data, crops_ugly_data, file = "data_crops.RData")

# load() restores the saved objects under the names they had when saved
load("data_crops.RData")


#' 4.2 Export your hygro_data object in a csv table
#'     row.names = FALSE keeps the row numbers out of the file; otherwise they
#'     are written as an extra column without a name, which becomes a column
#'     named `X` when the file is imported again with read.csv()
write.csv(hygro_data, "./Data/hygro_data_save.csv", row.names = FALSE)



# TP4 ---------------------------------------------------------------------

#' 1. Import the `mouse_viral_study.xlsx` data, using the `read_excel` function (you can use "From Excel..." from the "Import Dataset" button)
#' If you look at the code used to import the data, you can see that it needed to load a library

# `readxl::` calls the function from the readxl package without library()
mouse_viral_study <- readxl::read_excel(path = "Data/mouse_viral_study.xlsx")


#' 2. Use the `View()` function to visualize the data set
#'    You can also directly click on the dataset in the Environment

View(mouse_viral_study)


#' 3. Print the head and summary of this dataframe

head(mouse_viral_study)
summary(mouse_viral_study)


#' 4. Download the package `stringr` (if not already installed) and transform the names of columns with the function `str_to_upper()` such that all the names are in uppercase.
#'    To do this, you can load the package with `library()` or use an explicit call with `stringr::str_to_upper()`.

names(mouse_viral_study) <- stringr::str_to_upper(names(mouse_viral_study))
head(mouse_viral_study)


#' 5. Adapt the formats of the columns
#'    You can either import the dataset again or change them in your object after importation

# Option A: import the file again and declare the type of each column with
# `col_types`. The explicit `readxl::` prefix is needed because the package
# has not been attached with library() in this script.
mouse_viral_study_good_format <- readxl::read_xlsx(
  "Data/mouse_viral_study.xlsx",
  col_types = c("numeric", "numeric", "text")
)

# Option B: convert the columns of the object that is already imported.
mouse_viral_study$MED_1_ML <- as.numeric(mouse_viral_study$MED_1_ML)
mouse_viral_study$MED_2_ML <- as.numeric(mouse_viral_study$MED_2_ML)
# Backticks are required because the column name contains a space. The name
# must be identical on both sides, otherwise a new column is created instead
# of converting the existing one.
mouse_viral_study$`VIRUS PRESENT` <- as.character(mouse_viral_study$`VIRUS PRESENT`)


#' 6. (Bonus) Install the ISLR package containing several training datasets and load the data `Khan`

# Install ISLR only when it is missing, so that running the script again
# does not download and reinstall the package every time
if (!requireNamespace("ISLR", quietly = TRUE)) {
  install.packages("ISLR")
}
# `ISLR::Khan` reads the dataset from the package without attaching it
khan <- ISLR::Khan

#' Gene expression for 63 subjects in the train dataset and 20 subjects in the test dataset



# TP 5 --------------------------------------------------------------------

#' 1. Import the `crops_data.csv` in a `crops_data` variable
crops_data <- read.csv("./Data/crops_data.csv")

#' 2. Create a subset of the dataframe with the columns :"irrigation_type",
#'    "region", "crop_type", "soil_moisture", "rainfall_mm","pesticide_usage_ml" and
#'    "humidity".
#'    The empty space before the comma means "keep all the rows"
new_data <- crops_data[, c(
  "irrigation_type", "region", "crop_type", "soil_moisture",
  "rainfall_mm", "pesticide_usage_ml", "humidity"
)]


#' 3. Use na.omit() to remove NA values
#'    (rows with at least one NA in the selected columns are dropped)
new_data <- na.omit(new_data)


#' 4 Create a histogram to visualize the distribution of soil moisture. 
#'    Then add a vertical line to represent the mean of the distribution and add the curve of density
#'    Finally, use `legend()` function to add legend describing the two lines
# Histogram of soil moisture. freq = FALSE plots densities instead of counts,
# which puts the bars on the same scale as the density curve added below.
hist(
  new_data$soil_moisture,
  breaks = 50,
  freq = FALSE,
  col = "cyan3",
  xlab = "Soil moisture",
  main = "Distribution of soil moisture"
)
# Red vertical line at the mean of the distribution
abline(v = mean(new_data$soil_moisture), lwd = 3, col = "red")
# Purple curve: estimated density of the distribution
lines(density(new_data$soil_moisture), lwd = 3, col = "purple")
# Legend entries are listed in the same order as their colours
legend(
  "topright",
  legend = c(
    paste("Mean =", round(mean(new_data$soil_moisture), 2)),
    "Density"
  ),
  col = c("red", "purple"),
  lwd = 2,
  lty = c(1, 1)
)


#' 5 Create a boxplot of the pesticide usage in each region.
#'   For that, use the formula `pesticide_usage_ml ~ region` in the `boxplot()` function
#'   Change colors, title and axes labels

# The formula `y ~ group` draws one box of y for each value of group.
# las = 1 writes the axis labels horizontally.
boxplot(
  pesticide_usage_ml ~ region,
  data = new_data,
  main = "Pesticide usage by region",
  xlab = "Region",
  ylab = "Pesticide usage (ml)",
  col = c("#FF0000", "#CCFF00", "#00FF66", "#0066FF", "#CC00FF"),
  las = 1
)


#' 6 Create a scatter plot to represent the `soil_moisture` by `rainfall_mm` and color it depending on the region
#'   Change the general shape of points, using the `pch` parameter

# Scatter plot of soil moisture (y-axis) against rainfall (x-axis).
# The first vector given to plot() goes on the x-axis and the second on the
# y-axis, so xlab must describe rainfall and ylab soil moisture.
plot(
  new_data$rainfall_mm,
  new_data$soil_moisture,
  main = "Soil moisture by Rainfall",
  xlab = "Rainfall (mm)",
  ylab = "Soil moisture",
  pch = 16,                       # filled circles
  col = factor(new_data$region)   # one colour per region
)



# TP 6 ---------------------------------------------------------------------

#' 1. Install `tidyverse` package if you haven't already installed it
#'    Load it with `library()`

# Install tidyverse only when it is missing, then always load it.
# requireNamespace() only checks that the package is available; library()
# stops with an error if loading fails, so a problem is noticed immediately.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
library(tidyverse)


#' 2. Import the "crops_data.csv" in a "crops_data" variable
crops_data <- read.csv("./Data/crops_data.csv")


#' 3. With the help of the select() function,
#'    create a subset of the dataframe with the columns :"irrigation_type",
#'    "region", "crop_type", "soil_moisture", "rainfall_mm","pesticide_usage_ml" and
#'    "humidity".
#'    At the same time, use filter() to keep only the rows with humidity < 70,
#'    and create a column named pesticide_per_rain with the mutate() function
#'    HINT : use `%>%` to redirect each output
#'
#'    Each step below receives the result of the previous one:
#'    select columns, filter rows, add the new column, then drop rows with NA

new_data <- crops_data %>%
  select(
    irrigation_type, region, crop_type, soil_moisture,
    rainfall_mm, humidity, pesticide_usage_ml
  ) %>%
  filter(humidity < 70) %>%
  mutate(pesticide_per_rain = pesticide_usage_ml / rainfall_mm) %>%
  na.omit()


#' 4.1 Install esquisse package
#'     Either clicking on Packages -> Install, or using install.packages("esquisse")
#'     Then load it using library()
#'     And open it, clicking on Addins -> 'ggplot2' builder


#' 4.2 Do in Esquisse : 
#'     Load new_data
#'     Create a histogram to visualize the distribution of soil moisture
#'     Retrieve the code (bottom right of the esquisse panel)

# Histogram of soil moisture: the data and the aesthetic mapping are given
# to ggplot(), then layers are added with +
ggplot(new_data, aes(x = soil_moisture)) +
  geom_histogram(fill = "#91ABDA", bins = 30) +
  theme_minimal()


#' 4.3 Then add a vertical line to represent the mean of the distribution
#'     You can change the color of the line to make it more visible
#'     (You have to do this manually, it's not possible in Esquisse)

# Same histogram, with a red vertical line drawn on top at the mean
ggplot(new_data, aes(x = soil_moisture)) +
  geom_histogram(fill = "#91ABDA", bins = 30) +
  geom_vline(
    xintercept = mean(new_data$soil_moisture),
    color = "red"
  ) +
  theme_minimal()
  

#' 4.4 Do in Esquisse :
#'     Load new_data
#'     Create a boxplot of the pesticide usage in each region
#'     Color according to the region
#'     Change title : "Pesticide usage by region"
#'     Change xlabel : "Region" ; and ylabel : "Pesticide usage (mL)"
#'     You can change the color palette used - don't print the legend - ...

# One box per region, filled with a colour that depends on the region.
# The legend is hidden because the regions are already named on the x-axis.
ggplot(new_data, aes(x = region, y = pesticide_usage_ml, fill = region)) +
  geom_boxplot() +
  scale_fill_viridis_d(option = "viridis", direction = 1) +
  labs(
    title = "Pesticide usage by region",
    x = "Region",
    y = "Pesticide usage (mL)",
    fill = "Region"
  ) +
  theme_minimal() +
  theme(legend.position = "none")


#' 4.5 Do in Esquisse :
#'    Load new_data
#'    Create a scatter plot to represent the soil_moisture by rainfall_mm and color it depending on the region
#'    Change title ("Soil moisture according to rainfall, colored by region"), xlab ("Soil moisture") and ylab ("Rainfall (mm)")
#'    Change point size and shape
#'    Change colors for regions
#'    Download the plot and retrieve the code that generates the plot

# Scatter plot with soil moisture on the x-axis and rainfall on the y-axis,
# one colour per region.
ggplot(new_data) +
  aes(x = soil_moisture, y = rainfall_mm, colour = region) +
  geom_point(size = 2.3, shape = "triangle") +
  # Each region name is given its own colour
  scale_color_manual(
    values = c(
      `Central USA` = "#6D7BF8",
      `East Africa` = "#44D051",
      `North India` = "#E9C861",
      `South India` = "#E85C3C",
      `South USA` = "#FF61C3"
    )
  ) +
  # The legend title must be set on the aesthetic that is actually mapped:
  # the points use `colour`, so `colour = "Region"` (and not `fill`) is used
  labs(x = "Soil moisture",
       y = "Rainfall (mm)",
       title = "Soil moisture according to rainfall, colored by region",
       colour = "Region") +
  theme_minimal()



# TP 7 --------------------------------------------------------------------

#' 1.1 Create your first Rmd file by clicking File, New file and R markdown...
#' You can give a title to this document and your name as an author, output as html then click on ok
#' 
#' 
#' 1.2 Save your document with the title you want with CTRL + s or by clicking save
#' 
#' Automatically, R studio gives you a template with some basic explanations and necessary things to make it run
#' You can see in the YAML the title, author but also more importantly : the output
#' 
#' 
#' 1.3 Knit your Rmd to create a first version of the html report 
#' You can use the knit button or do to it here in the .R file with rmarkdown::render()

# The report is written in a `reports` sub-folder. The rmarkdown documentation
# notes that a folder given through `output_file` must already exist, so it is
# created first (showWarnings = FALSE: no warning if it is already there).
dir.create("reports", showWarnings = FALSE)

# clean = TRUE removes the intermediate files created while rendering
rmarkdown::render('./my_document.Rmd',
                  output_file = 'reports/my_first_report.html',
                  clean = TRUE)


#' In the Rmd, look at the first setup chunk {r setup, include=FALSE}. 
#' It is used to put global options for the whole document, this is done with knitr::opts_chunk$set(echo = TRUE)
#' And means that every chunk will have the echo parameter to TRUE and the code will be displayed in the html (unless specified to hide it)
#' 
#' 
#' Your goal is to create a report where we will display the plots we have already created in this training
#' 
#' 1.4 Copy your code to import the data and display the plots you did in TP 5 and TP 6.
#' 
#' 
#' 1.5 Make sure to add titles and texts to describe what you are plotting.
#' 
#' 
#' 1.6 (optional) Understanding chunk options


