Packages utilisés
Importation et nettoyage des données
Préparation des données avec interpolation et prédiction
Analyse des données
Classification

Packages utilisés

Importation et nettoyage des données

data <- read.csv("croissance.csv")

# Suppression des NA
data <- na.omit(data)

data$sexe <- as.factor(data$sexe)

# Séparer le dataset par colonne sexe et les mettre dans une liste 
data_list <- split(data, data$sexe)

# Séparer par rapport à ind (identifiant d'individu)
data_list <- lapply(data_list, function(x) split(x, x$ind))

# Trouvé les individus qui ont des tailles decroissantes
individus_decroissants <- lapply(data_list, function(x) {
  lapply(x, function(y) {
    y %>% 
      mutate(taille_diff = c(0, diff(taille))) %>% 
      filter(taille_diff < 0)
  })
})

# Supprimer la valeur aberrante (taille décroisante) en gardant la taille n-1
data_list <- mapply(function(x, y) {
  mapply(function(z, w) {
    if (nrow(w) > 0) {
      z <- z[-nrow(z), ]
    }
    z
  }, x, y, SIMPLIFY = FALSE)
}, data_list, individus_decroissants)

Préparation des données avec interpolation et prédiction

# Fonction pour interpoler les données
data_list_interp <- lapply(data_list, function(x) {
  lapply(x, function(y) {
    age_min <- min(y$age)
    age_max <- max(y$age)
    age_round <- seq(ceiling(age_min), floor(age_max))
    
    # Interpolation PCHIP pour taille et poids
    taille_interp <- pchip(y$age, y$taille, age_round)
    poids_interp <- pchip(y$age, y$poids, age_round)
    
    data.frame(age = age_round, taille = taille_interp, poids = poids_interp)
  })
})

# Prédire la taille et le poids des âges manquant en utilisant les individus semblables ayant des données pour ces âges et la vitesse de croissance

# Fonction pour prédire la taille et le poids des âges manquants
predire_taille_poids <- function(data_list, age_min, age_max) {
  lapply(data_list, function(sex_group) {
    lapply(sex_group, function(ind_data) {
      
      # Trouver les âges manquants
      ages_manquants <- setdiff(seq(age_min, age_max), ind_data$age)
      
      if (length(ages_manquants) > 0) {
        
        # Calcul de la vitesse de croissance (différence de taille et poids)
        croissance <- ind_data %>%
          arrange(age) %>%
          mutate(
            taille_diff = c(0, diff(taille)),
            poids_diff = c(0, diff(poids)),
            age_diff = c(0, diff(age))
          ) %>%
          mutate(
            taille_vitesse = taille_diff / age_diff,
            poids_vitesse = poids_diff / age_diff
          )
        
        # Moyenne des vitesses de croissance sur les intervalles valides
        taille_vitesse_moy <- mean(croissance$taille_vitesse[!is.na(croissance$taille_vitesse) & croissance$age_diff > 0], na.rm = TRUE)
        poids_vitesse_moy <- mean(croissance$poids_vitesse[!is.na(croissance$poids_vitesse) & croissance$age_diff > 0], na.rm = TRUE)
        
        # Prédiction des tailles et poids pour les âges manquants
        pred_data <- data.frame(
          age = ages_manquants,
          taille = NA,
          poids = NA
        )
        
        for (i in seq_along(ages_manquants)) {
          age_m <- ages_manquants[i]
          
          # Trouver les données les plus proches en âge
          closest_before <- ind_data %>% filter(age < age_m) %>% arrange(desc(age)) %>% head(1)
          closest_after <- ind_data %>% filter(age > age_m) %>% arrange(age) %>% head(1)
          
          # Interpolation avec la vitesse de croissance
          if (nrow(closest_before) > 0 && nrow(closest_after) > 0) {
            # Si on a des points de part et d'autre, on interpole
            delta_age_before <- age_m - closest_before$age
            delta_age_after <- closest_after$age - age_m
            
            pred_data$taille[i] <- closest_before$taille + delta_age_before * taille_vitesse_moy
            pred_data$poids[i] <- closest_before$poids + delta_age_before * poids_vitesse_moy
          } else if (nrow(closest_before) > 0) {
            # Sinon on extrapole en utilisant le point précédent
            delta_age <- age_m - closest_before$age
            pred_data$taille[i] <- closest_before$taille + delta_age * taille_vitesse_moy
            pred_data$poids[i] <- closest_before$poids + delta_age * poids_vitesse_moy
          } else if (nrow(closest_after) > 0) {
            # Ou en utilisant le point suivant
            delta_age <- closest_after$age - age_m
            pred_data$taille[i] <- closest_after$taille - delta_age * taille_vitesse_moy
            pred_data$poids[i] <- closest_after$poids - delta_age * poids_vitesse_moy
          }
        }
        
        # Ajouter les prédictions aux données existantes
        ind_data <- bind_rows(ind_data, pred_data) %>%
          arrange(age)
      }
      
      return(ind_data)
    })
  })
}

# Utilisation de la fonction pour prédire les tailles et poids manquants
data_list_interp <- predire_taille_poids(data_list_interp, 1, 18)

# Fonction pour calculer l'IMC en fonction de la taille et du poids
imc_calculate <- function(weight, height) {
  weight / ((height/100)^2)
}

# Ajout de l'IMC dans les données
data_list_interp <- lapply(data_list_interp, function(x) {
  lapply(x, function(y) {
    y$imc <- imc_calculate(y$poids, y$taille)
    y
  })
})

# Fonction pour rajouter interpretation de l'IMC
interpret_imc <- function(imc) {
  if (imc < 16) {
    return("Maigreur sévère")
  } else if (imc < 16.5) {
    return("Maigreur modérée")
  } else if (imc < 18.5) {
    return("Maigreur légère")
  } else if (imc < 25) {
    return("Poids normal")
  } else if (imc < 30) {
    return("Surpoids")
  } else if (imc < 35) {
    return("Obésité modérée")
  } else if (imc < 40) {
    return("Obésité sévère")
  } else {
    return("Obésité morbide")
  }
}

# Ajouter l'interprétation de l'IMC aux données
data_list_interp <- lapply(data_list_interp, function(x) {
  lapply(x, function(y) {
    y$imc_interpretation <- sapply(y$imc, interpret_imc)
    y
  })
})

# Ajouter taux de croissance à chaque âge pour chaque individu
data_list_interp <- lapply(data_list_interp, function(x) {
  lapply(x, function(y) {
    y <- y %>% 
      arrange(age) %>%
      mutate(
        taille_diff = c(0, diff(taille)),
        poids_diff = c(0, diff(poids)),
        age_diff = c(0, diff(age))
      ) %>%
      mutate(
        taille_vitesse = taille_diff / age_diff,
        poids_vitesse = poids_diff / age_diff
      )
    y
  })
})

Analyse des données

# Fonction pour calculer les statistiques de taille et poids
calculer_statistiques <- function(data, colonne_taille = "taille", colonne_poids = "poids", colonne_age = "age") {
  # Calcul de la médiane et des écarts-types pour la taille par groupe d'âge
  statistiques_taille <- data %>%
    bind_rows() %>%
    group_by(!!sym(colonne_age)) %>%
    summarise(
      taille_mediane = median(!!sym(colonne_taille)),  # Médiane (M)
      ecart_type_taille = sd(!!sym(colonne_taille))  # Écart-type (σ)
    ) %>%
    mutate(
      taille_plus_1ecart = taille_mediane + ecart_type_taille,
      taille_moins_1ecart = taille_mediane - ecart_type_taille,
      taille_plus_2ecart = taille_mediane + 2 * ecart_type_taille,
      taille_moins_2ecart = taille_mediane - 2 * ecart_type_taille,
      taille_plus_3ecart = taille_mediane + 3 * ecart_type_taille,
      taille_moins_3ecart = taille_mediane - 3 * ecart_type_taille
    )
  
  # Calcul des percentiles pour le poids par groupe d'âge
  percentiles_poids <- data %>%
    bind_rows() %>%
    group_by(!!sym(colonne_age)) %>%
    summarise(
      p1 = quantile(!!sym(colonne_poids), probs = 0.01),
      p3 = quantile(!!sym(colonne_poids), probs = 0.03),
      p10 = quantile(!!sym(colonne_poids), probs = 0.10),
      p25 = quantile(!!sym(colonne_poids), probs = 0.25),
      p50 = quantile(!!sym(colonne_poids), probs = 0.50),
      p75 = quantile(!!sym(colonne_poids), probs = 0.75),
      p90 = quantile(!!sym(colonne_poids), probs = 0.90),
      p97 = quantile(!!sym(colonne_poids), probs = 0.97),
      p99 = quantile(!!sym(colonne_poids), probs = 0.99)
    )
  
  # Fusionner les résultats pour la taille et le poids par âge
  statistiques_combinees <- left_join(statistiques_taille, percentiles_poids, by = colonne_age)
  
  return(statistiques_combinees)
}

ind_M <- calculer_statistiques(data_list_interp$M)
ind_F <- calculer_statistiques(data_list_interp$F)

# Fonction pour générer les graphiques de taille
generer_graphique_taille <- function(data, sexe) {
  if (sexe == "garçons") {
    limites_taille <- c(50, 220)
  } else {
    limites_taille <- c(50, 190)
  }

  ggplot(data, aes(x = age)) +
    geom_line(aes(y = taille_mediane), size = 1.2, color = "black") +
    geom_line(aes(y = taille_plus_1ecart), linetype = "dashed", color = "black") +
    geom_line(aes(y = taille_moins_1ecart), linetype = "dashed", color = "black") +
    geom_line(aes(y = taille_plus_2ecart), linetype = "dotted", color = "black") +
    geom_line(aes(y = taille_moins_2ecart), linetype = "dotted", color = "black") +
    geom_line(aes(y = taille_plus_3ecart), linetype = "dotdash", color = "black") +
    geom_line(aes(y = taille_moins_3ecart), linetype = "dotdash", color = "black") +
    scale_y_continuous(
      limits = limites_taille,
      breaks = seq(floor(limites_taille[1]), ceiling(limites_taille[2]), 10)
    ) +
    scale_x_continuous(breaks = seq(1, 18, 1)) +
    labs(title = paste("Courbes de taille pour les", sexe, "de 1 à 18 ans"),
         x = "Âge (années)", y = "Taille (cm)") +
    theme_minimal(base_size = 12) +
    theme(panel.grid.major = element_line(size = 0.5, color = "grey80"),
          panel.grid.minor = element_line(size = 0.2, color = "grey90")) +
    # Placer les annotations à droite de la courbe
    annotate("text", x = 18.5, y = tail(data$taille_mediane, 1), label = "M", vjust = 0.5, hjust = 0, size = 3, color = "black") +
    annotate("text", x = 18.5, y = tail(data$taille_plus_1ecart, 1), label = "+1σ", vjust = 0.5, hjust = 0, size = 3, color = "black") +
    annotate("text", x = 18.5, y = tail(data$taille_moins_1ecart, 1), label = "-1σ", vjust = 0.5, hjust = 0, size = 3, color = "black") +
    annotate("text", x = 18.5, y = tail(data$taille_plus_2ecart, 1), label = "+2σ", vjust = 0.5, hjust = 0, size = 3, color = "black") +
    annotate("text", x = 18.5, y = tail(data$taille_moins_2ecart, 1), label = "-2σ", vjust = 0.5, hjust = 0, size = 3, color = "black") +
    annotate("text", x = 18.5, y = tail(data$taille_plus_3ecart, 1), label = "+3σ", vjust = 0.5, hjust = 0, size = 3, color = "black") +
    annotate("text", x = 18.5, y = tail(data$taille_moins_3ecart, 1), label = "-3σ", vjust = 0.5, hjust = 0, size = 3, color = "black")
}

# Fonction pour générer les graphiques de poids
generer_graphique_poids <- function(data, sexe) {
  if (sexe == "garçons") {
    limites_poids <- c(0, 110)
  } else {
    limites_poids <- c(0, 100)
  }

  ggplot(data, aes(x = age)) +
    geom_line(aes(y = p50), size = 1.2, color = "black") +
    geom_line(aes(y = p1), linetype = "dashed", color = "black") +
    geom_line(aes(y = p3), linetype = "dashed", color = "black") +
    geom_line(aes(y = p10), linetype = "dotted", color = "black") +
    geom_line(aes(y = p25), linetype = "dotted", color = "black") +
    geom_line(aes(y = p75), linetype = "dotted", color = "black") +
    geom_line(aes(y = p90), linetype = "dashed", color = "black") +
    geom_line(aes(y = p97), linetype = "dashed", color = "black") +
    geom_line(aes(y = p99), linetype = "dotdash", color = "black") +
    scale_y_continuous(
      limits = limites_poids,
      breaks = seq(floor(limites_poids[1]), ceiling(limites_poids[2]), 10)
    ) +
    scale_x_continuous(breaks = seq(1, 18, 1)) +
    labs(title = paste("Courbes de poids pour les", sexe, "de 1 à 18 ans"),
         x = "Âge (années)", y = "Poids (kg)") +
    theme_minimal(base_size = 12) +
    theme(panel.grid.major = element_line(size = 0.5, color = "grey80"),
          panel.grid.minor = element_line(size = 0.2, color = "grey90")) +
    # Placer les annotations à droite de la courbe
    annotate("text", x = 18.5, y = tail(data$p50, 1), label = "M (50%)", vjust = 0.5, hjust = 0, size = 3, color = "black") +
    annotate("text", x = 18.5, y = tail(data$p1, 1), label = "1%", vjust = 0.5, hjust = 0, size = 3, color = "black") +
    annotate("text", x = 18.5, y = tail(data$p3, 1), label = "3%", vjust = 0.5, hjust = 0, size = 3, color = "black") +
    annotate("text", x = 18.5, y = tail(data$p10, 1), label = "10%", vjust = 0.5, hjust = 0, size = 3, color = "black") +
    annotate("text", x = 18.5, y = tail(data$p25, 1), label = "25%", vjust = 0.5, hjust = 0, size = 3, color = "black") +
    annotate("text", x = 18.5, y = tail(data$p75, 1), label = "75%", vjust = 0.5, hjust = 0, size = 3, color = "black") +
    annotate("text", x = 18.5, y = tail(data$p90, 1), label = "90%", vjust = 0.5, hjust = 0, size = 3, color = "black") +
    annotate("text", x = 18.5, y = tail(data$p97, 1), label = "97%", vjust = 0.5, hjust = 0, size = 3, color = "black") +
    annotate("text", x = 18.5, y = tail(data$p99, 1), label = "99%", vjust = 0.5, hjust = 0, size = 3, color = "black")
}

# Fonction pour générer et sauvegarder les graphiques
generer_graphiques <- function(data, sexe, chemin_sauvegarde) {
  graphique_taille <- generer_graphique_taille(data, sexe)
  ggsave(filename = paste0(chemin_sauvegarde, "graphique_taille_", sexe, ".jpeg"), plot = graphique_taille, dpi = 300)

  graphique_poids <- generer_graphique_poids(data, sexe)
  ggsave(filename = paste0(chemin_sauvegarde, "graphique_poids_", sexe, ".jpeg"), plot = graphique_poids, dpi = 300)

  list(graphique_taille = graphique_taille, graphique_poids = graphique_poids)
}

# Exécution de la fonction
chemin <- "graphiques/"
courbe_M <- generer_graphiques(ind_M, "garçons", chemin)

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Saving 7 x 5 in image
## Saving 7 x 5 in image

courbe_F <- generer_graphiques(ind_F, "filles", chemin)

## Saving 7 x 5 in image
## Saving 7 x 5 in image

# Affichage des graphiques
courbe_M$graphique_taille

courbe_M$graphique_poids

courbe_F$graphique_taille

courbe_F$graphique_poids

# Fonction pour superposer les graphiques de taille et de poids pour les garçons et les filles
superposer_graphiques <- function(data_garcons, data_filles, type = "taille") {
  if (type == "taille") {
    limites <- c(50, 210)  # Plage ajustée pour la taille
    y_label <- "Taille (cm)"
    titre <- "Courbes de taille superposées pour garçons et filles"
    breaks_y <- seq(50, 210, 10)  # Étiquettes des axes Y espacées tous les 10 cm
    
    courbe_garcons <- ggplot(data_garcons, aes(x = age)) +
      geom_ribbon(
        aes(ymin = taille_moins_3ecart, ymax = taille_plus_3ecart, fill = "Garçons - Zone ±3σ"),
        alpha = 0.1, color = NA
      ) +
      geom_ribbon(
        aes(ymin = taille_moins_1ecart, ymax = taille_plus_1ecart, fill = "Garçons - Zone ±1σ"),
        alpha = 0.3, color = NA
      ) +
      geom_line(aes(y = taille_mediane, color = "Garçons - Médiane"), size = 1.2) 
    
    courbe_filles <- courbe_garcons +
      geom_ribbon(
        data = data_filles,
        aes(ymin = taille_moins_3ecart, ymax = taille_plus_3ecart, fill = "Filles - Zone ±3σ"),
        alpha = 0.1, color = NA
      ) +
      geom_ribbon(
        data = data_filles,
        aes(ymin = taille_moins_1ecart, ymax = taille_plus_1ecart, fill = "Filles - Zone ±1σ"),
        alpha = 0.3, color = NA
      ) +
      geom_line(data = data_filles, aes(x = age, y = taille_mediane, color = "Filles - Médiane"), size = 1.2)
    
  } else if (type == "poids") {
    limites <- c(0, 110)  # Plage ajustée pour le poids
    y_label <- "Poids (kg)"
    titre <- "Courbes de poids superposées pour garçons et filles"
    breaks_y <- seq(0, 110, 10)  # Étiquettes des axes Y espacées tous les 10 kg
    
    courbe_garcons <- ggplot(data_garcons, aes(x = age)) +
      geom_ribbon(
        aes(ymin = p1, ymax = p99, fill = "Garçons - Zone 1%-99%"),
        alpha = 0.1, color = NA
      ) +
      geom_ribbon(
        aes(ymin = p25, ymax = p75, fill = "Garçons - Zone 25%-75%"),
        alpha = 0.3, color = NA
      ) +
      geom_line(aes(y = p50, color = "Garçons - Médiane (50%)"), size = 1.2)
    
    courbe_filles <- courbe_garcons +
      geom_ribbon(
        data = data_filles,
        aes(ymin = p1, ymax = p99, fill = "Filles - Zone 1%-99%"),
        alpha = 0.1, color = NA
      ) +
      geom_ribbon(
        data = data_filles,
        aes(ymin = p25, ymax = p75, fill = "Filles - Zone 25%-75%"),
        alpha = 0.3, color = NA
      ) +
      geom_line(data = data_filles, aes(x = age, y = p50, color = "Filles - Médiane (50%)"), size = 1.2)
    
  } else {
    stop("Type invalide. Utilisez 'taille' ou 'poids'.")
  }
  
  # Finalisation du graphique
  graphique_superpose <- courbe_filles +
    scale_y_continuous(
      limits = limites,
      breaks = breaks_y,
      expand = c(0, 0)
    ) +
    scale_x_continuous(
      breaks = seq(1, 18, 1),  # Espacement des étiquettes tous les 1 ans
      expand = c(0, 0)
    ) +
    labs(
      title = titre,
      x = "Âge (années)",
      y = y_label
    ) +
    scale_color_manual(
      values = c(
        "Garçons - Médiane" = "blue", "Filles - Médiane" = "red",
        "Garçons - Médiane (50%)" = "blue", "Filles - Médiane (50%)" = "red"
      )
    ) +
    scale_fill_manual(
      values = c(
        "Garçons - Zone ±3σ" = "blue", "Garçons - Zone ±1σ" = "blue",
        "Filles - Zone ±3σ" = "red", "Filles - Zone ±1σ" = "red",
        "Garçons - Zone 1%-99%" = "blue", "Filles - Zone 1%-99%" = "red",
        "Garçons - Zone 25%-75%" = "blue", "Filles - Zone 25%-75%" = "red"
      ),
      guide = guide_legend(
        override.aes = list(shape = 15, size = 4)  # Crée un carré pour la légende
      )
    ) +
    theme_minimal(base_size = 14) +
    theme(
      panel.grid.major = element_line(size = 0.6, color = "grey80"),
      panel.grid.minor = element_line(size = 0.3, color = "grey90"),
      legend.position = "bottom",
      axis.text.x = element_text(size = 12),
      axis.text.y = element_text(size = 12),
      axis.title = element_text(size = 14),
      plot.title = element_text(hjust = 0.5, size = 16)
    )
  
  return(graphique_superpose)
}

# Création et sauvegarde des graphiques superposés
graphique_taille_superpose <- superposer_graphiques(ind_M, ind_F, type = "taille")
ggsave(filename = paste0(chemin, "graphique_taille_superpose.jpeg"), plot = graphique_taille_superpose, dpi = 400)

## Saving 7 x 5 in image

graphique_poids_superpose <- superposer_graphiques(ind_M, ind_F, type = "poids")
ggsave(filename = paste0(chemin, "graphique_poids_superpose.jpeg"), plot = graphique_poids_superpose, dpi = 400)

## Saving 7 x 5 in image

# Affichage des graphiques superposés
print(graphique_taille_superpose)

print(graphique_poids_superpose)

# Créer une base de données avec des intervalles pour les catégories d'IMC
height <- seq(50, 220, by = 0.1)  # Taille en cm
weight <- seq(0, 110, by = 0.1)   # Poids en kg

# Calculer l'IMC pour chaque combinaison de taille et de poids
imc_data <- expand.grid(height = height, weight = weight)
imc_data$imc <- imc_calculate(imc_data$weight, imc_data$height)

# Définir les catégories d'IMC
imc_data$category <- cut(imc_data$imc,
                         breaks = c(-Inf, 16, 18.4, 24.9, 29.9, 34.9, 39.9, Inf),
                         labels = c("Maigreur sévère", "Insuffisance pondérale", "Poids normal", 
                                    "Surpoids", "Obésité modérée", "Obésité sévère", "Obésité morbide"))

# Définir les couleurs pour chaque catégorie
colors <- c("Maigreur sévère" = "lightblue", 
            "Insuffisance pondérale" = "lightgreen", 
            "Poids normal" = "green", 
            "Surpoids" = "yellow", 
            "Obésité modérée" = "orange", 
            "Obésité sévère" = "orangered", 
            "Obésité morbide" = "red")

# Créer le graphique
imc_plot <- ggplot(imc_data, aes(x = height, y = weight, fill = category)) +
  geom_tile() +
  scale_fill_manual(values = colors) +
  scale_x_continuous(breaks = seq(50, 220, by = 10)) +
  scale_y_continuous(breaks = seq(0, 110, by = 10)) +  
  labs(x = "Taille (cm)", y = "Poids (kg)", fill = "Catégorie d'IMC",
       title = "Interprétation de l'indice de masse corporelle") +
  theme_minimal()

# Sauvegarder le graphique
ggsave("graphiques/imc_plot.jpeg", plot = imc_plot, width = 12, height = 6)

# Calculer la moyenne de la vitesse de croissance à chaque age pour chaque sexe
vitesse_croissance_M <- bind_rows(data_list_interp$M, .id = "individu") %>%
  bind_rows() %>%
  group_by(age) %>%
  summarise(
    taille_vitesse_moyenne = mean(taille_vitesse, na.rm = TRUE),
    poids_vitesse_moyenne = mean(poids_vitesse, na.rm = TRUE)
  ) %>%
  ungroup()

vitesse_croissance_F <- bind_rows(data_list_interp$F, .id = "individu") %>%
  bind_rows() %>%
  group_by(age) %>%
  summarise(
    taille_vitesse_moyenne = mean(taille_vitesse, na.rm = TRUE),
    poids_vitesse_moyenne = mean(poids_vitesse, na.rm = TRUE)
  ) %>%
  ungroup()

vitesse_croissance <- list(M = vitesse_croissance_M, F = vitesse_croissance_F)


# Fonction pour générer un tracé superposé des courbes de croissance
generer_courbe_superposee <- function(data_list_interp, sexe = "M") {
  # Vérifier que l'entrée est correcte
  if (!is.list(data_list_interp[[sexe]])) {
    stop("Les données fournies ne sont pas une liste de courbes.")
  }

  # Combiner toutes les données dans un seul data frame
  data_combined <- bind_rows(data_list_interp[[sexe]], .id = "individu")

  # Convertir la colonne "individu" en facteur pour des couleurs distinctes (si souhaité)
  data_combined$individu <- as.factor(data_combined$individu)

  # Tracer les courbes de croissance pour tous les individus
  ggplot(data = data_combined, aes(x = age, y = taille, group = individu)) +
    geom_line(alpha = 0.1, color = "black") +  # Alpha réduit pour rendre les courbes plus visibles
    labs(title = paste("Courbe de croissance (points reliés) pour tous les individus", sexe),
         x = "Âge (années)", y = "Taille (cm)") +
    theme_minimal()
}

generer_courbe_superposee(data_list_interp, sexe = "M")

generer_courbe_superposee(data_list_interp, sexe = "F")

# Verification s’il y a une différence significative entre les filles et les garçons pour la taille et pour le poids, pour chaque age.

# Fonction pour traiter les données et calculer moyennes ou médianes
process_data <- function(data_list, variable, statistic) {
  data <- bind_rows(data_list$M, .id = "individu") %>%
    mutate(sexe = "M") %>%
    bind_rows(bind_rows(data_list$F, .id = "individu") %>%
                mutate(sexe = "F")) %>%
    group_by(age, sexe) %>%
    summarise(!!variable := statistic(!!sym(variable), na.rm = TRUE)) %>%
    ungroup()
  return(data)
}

# Fonction principale d'extraction, de calcul des différences et d'interprétation
extract_and_analyze <- function(data_list, variable, threshold = 5) {
  # Vérifier si la variable est valide
  if (!(variable %in% c("taille", "poids", "taille_vitesse", "poids_vitesse"))) {
    stop("La variable spécifiée n'est pas valide.")
  }
  
  # Appliquer la fonction pour les moyennes et les médianes
  data_grouped_mean <- process_data(data_list, variable, mean)
  data_grouped_median <- process_data(data_list, variable, median)
  
  # Fonction pour transformer les données en format large (par sexe)
  spread_data <- function(data, variable) {
    data %>%
      spread(sexe, !!sym(variable)) %>%
      mutate_at(vars(starts_with("F"), starts_with("M")), list(~ifelse(is.na(.), 0, .)))
  }
  
  # Appliquer la fonction de transformation pour la variable spécifiée
  data_grouped_mean_variable <- spread_data(data_grouped_mean, variable)
  data_grouped_median_variable <- spread_data(data_grouped_median, variable)
  
  # Calculer la différence entre les garçons et les filles
  data_grouped_mean_variable$diff <- data_grouped_mean_variable$M - data_grouped_mean_variable$F
  data_grouped_median_variable$diff <- data_grouped_median_variable$M - data_grouped_median_variable$F
  
  # Transformer cette différence en pourcentage de la variable speicifiée  des filles
  data_grouped_mean_variable$diff_pct <- (data_grouped_mean_variable$diff / data_grouped_mean_variable$F) * 100
  data_grouped_median_variable$diff_pct <- (data_grouped_median_variable$diff / data_grouped_median_variable$F) * 100
  
  # Ajouter une colonne pour déterminer si la différence est significative (> threshold)
  data_grouped_mean_variable$significatif <- ifelse(
    abs(data_grouped_mean_variable$diff_pct) > threshold,
    "Significatif",
    "Non significatif"
  )
  data_grouped_median_variable$significatif <- ifelse(
    abs(data_grouped_median_variable$diff_pct) > threshold,
    "Significatif",
    "Non significatif"
  )
  
  # Créer un tableau avec les résultats pour la variable spécifiée
  if (variable == "taille") {
    result_table <- data.frame(
      Taille_Moyenne = data_grouped_mean_variable$significatif,
      Taille_Médiane = data_grouped_median_variable$significatif,
      row.names = data_grouped_mean_variable$age
    )
  } else if (variable == "poids") {
    result_table <- data.frame(
      Poids_Moyenne = data_grouped_mean_variable$significatif,
      Poids_Médiane = data_grouped_median_variable$significatif,
      row.names = data_grouped_mean_variable$age
    )
  } else if (variable == "taille_vitesse") {
    result_table <- data.frame(taille_vitesse_moyenne = data_grouped_mean_variable$significatif,
                               row.names = data_grouped_mean_variable$age)
  } else {
    result_table <- data.frame(poids_vitesse_moyenne = data_grouped_mean_variable$significatif,
                               row.names = data_grouped_mean_variable$age)
  }
  return(result_table)
}

# Appel à la fonction avec un seuil de 5% pour la taille
result_taille <- extract_and_analyze(data_list_interp, "taille", threshold = 5)

## `summarise()` has grouped output by 'age'. You can override using the `.groups`
## argument.
## `summarise()` has grouped output by 'age'. You can override using the `.groups`
## argument.

# Appel à la fonction avec un seuil de 5% pour le poids
result_poids <- extract_and_analyze(data_list_interp, "poids", threshold = 5)

## `summarise()` has grouped output by 'age'. You can override using the `.groups`
## argument.
## `summarise()` has grouped output by 'age'. You can override using the `.groups`
## argument.

# Appel à la fonction avec un seuil de 5% pour taille_vitesse
result_taille_vitesse <- extract_and_analyze(data_list_interp,"taille_vitesse", threshold = 5)

## `summarise()` has grouped output by 'age'. You can override using the `.groups`
## argument.
## `summarise()` has grouped output by 'age'. You can override using the `.groups`
## argument.

# Appel à la fonction avec un seuil de 5% pour poids_vitesse
result_poids_vitesse <- extract_and_analyze(data_list_interp,"poids_vitesse", threshold = 5)

## `summarise()` has grouped output by 'age'. You can override using the `.groups`
## argument.
## `summarise()` has grouped output by 'age'. You can override using the `.groups`
## argument.

# Regrouper les résultats pour l'affichage
result_combined <- cbind(result_taille, result_poids, result_taille_vitesse, result_poids_vitesse)

# Afficher les résultats
result_combined

##      Taille_Moyenne   Taille_Médiane    Poids_Moyenne    Poids_Médiane
## 1  Non significatif Non significatif     Significatif     Significatif
## 2  Non significatif Non significatif Non significatif Non significatif
## 3  Non significatif Non significatif Non significatif Non significatif
## 4  Non significatif Non significatif Non significatif Non significatif
## 5  Non significatif Non significatif Non significatif Non significatif
## 6  Non significatif Non significatif Non significatif Non significatif
## 7  Non significatif Non significatif Non significatif Non significatif
## 8  Non significatif Non significatif Non significatif Non significatif
## 9  Non significatif Non significatif Non significatif Non significatif
## 10 Non significatif Non significatif Non significatif Non significatif
## 11 Non significatif Non significatif     Significatif Non significatif
## 12 Non significatif Non significatif Non significatif Non significatif
## 13 Non significatif Non significatif Non significatif Non significatif
## 14 Non significatif Non significatif Non significatif Non significatif
## 15 Non significatif Non significatif Non significatif Non significatif
## 16     Significatif     Significatif     Significatif     Significatif
## 17     Significatif     Significatif     Significatif     Significatif
## 18     Significatif     Significatif     Significatif     Significatif
##    taille_vitesse_moyenne poids_vitesse_moyenne
## 1                    <NA>                  <NA>
## 2        Non significatif          Significatif
## 3        Non significatif      Non significatif
## 4            Significatif      Non significatif
## 5            Significatif      Non significatif
## 6        Non significatif      Non significatif
## 7        Non significatif      Non significatif
## 8            Significatif          Significatif
## 9            Significatif      Non significatif
## 10       Non significatif          Significatif
## 11           Significatif          Significatif
## 12       Non significatif      Non significatif
## 13           Significatif          Significatif
## 14           Significatif          Significatif
## 15           Significatif          Significatif
## 16           Significatif          Significatif
## 17           Significatif          Significatif
## 18           Significatif          Significatif

# Appel à la fonction avec un seuil de 3% pour la taille
result_taille_3 <- extract_and_analyze(data_list_interp, "taille", threshold = 3)

## `summarise()` has grouped output by 'age'. You can override using the `.groups`
## argument.
## `summarise()` has grouped output by 'age'. You can override using the `.groups`
## argument.

# Appel à la fonction avec un seuil de 3% pour le poids
result_poids_3 <- extract_and_analyze(data_list_interp, "poids", threshold = 3)

## `summarise()` has grouped output by 'age'. You can override using the `.groups`
## argument.
## `summarise()` has grouped output by 'age'. You can override using the `.groups`
## argument.

# Appel à la fonction avec un seuil de 3% pour taille_vitesse
result_taille_vitesse_3 <- extract_and_analyze(data_list_interp,"taille_vitesse", threshold = 3)

## `summarise()` has grouped output by 'age'. You can override using the `.groups`
## argument.
## `summarise()` has grouped output by 'age'. You can override using the `.groups`
## argument.

# Appel à la fonction avec un seuil de 3% pour poids_vitesse
result_poids_vitesse_3 <- extract_and_analyze(data_list_interp,"poids_vitesse", threshold = 3)

## `summarise()` has grouped output by 'age'. You can override using the `.groups`
## argument.
## `summarise()` has grouped output by 'age'. You can override using the `.groups`
## argument.

# Regrouper les résultats pour l'affichage
result_combined_3 <- cbind(result_taille_3, result_poids_3, result_taille_vitesse_3, result_poids_vitesse_3)

# Afficher les résultats
result_combined_3

##      Taille_Moyenne   Taille_Médiane    Poids_Moyenne    Poids_Médiane
## 1  Non significatif Non significatif     Significatif     Significatif
## 2  Non significatif Non significatif Non significatif Non significatif
## 3  Non significatif Non significatif Non significatif Non significatif
## 4  Non significatif Non significatif Non significatif Non significatif
## 5  Non significatif Non significatif Non significatif Non significatif
## 6  Non significatif Non significatif Non significatif Non significatif
## 7  Non significatif Non significatif Non significatif Non significatif
## 8  Non significatif Non significatif Non significatif Non significatif
## 9  Non significatif Non significatif Non significatif Non significatif
## 10 Non significatif Non significatif Non significatif Non significatif
## 11 Non significatif Non significatif     Significatif     Significatif
## 12 Non significatif Non significatif     Significatif     Significatif
## 13 Non significatif Non significatif Non significatif Non significatif
## 14 Non significatif Non significatif Non significatif Non significatif
## 15     Significatif     Significatif     Significatif     Significatif
## 16     Significatif     Significatif     Significatif     Significatif
## 17     Significatif     Significatif     Significatif     Significatif
## 18     Significatif     Significatif     Significatif     Significatif
##    taille_vitesse_moyenne poids_vitesse_moyenne
## 1                    <NA>                  <NA>
## 2        Non significatif          Significatif
## 3            Significatif          Significatif
## 4            Significatif      Non significatif
## 5            Significatif      Non significatif
## 6        Non significatif      Non significatif
## 7        Non significatif      Non significatif
## 8            Significatif          Significatif
## 9            Significatif      Non significatif
## 10           Significatif          Significatif
## 11           Significatif          Significatif
## 12       Non significatif      Non significatif
## 13           Significatif          Significatif
## 14           Significatif          Significatif
## 15           Significatif          Significatif
## 16           Significatif          Significatif
## 17           Significatif          Significatif
## 18           Significatif          Significatif

# Fonction pour extraire les âges où la différence est significative
extract_significant_ages <- function(result_table) {
  # Extraire les âges où la différence est significative
  ages_taille <- rownames(result_table[result_table$Taille_Moyenne == "Significatif" | result_table$Taille_Médiane == "Significatif", ])
  ages_poids <- rownames(result_table[result_table$Poids_Moyenne == "Significatif" | result_table$Poids_Médiane == "Significatif", ])
  ages_vitesse_taille <- rownames(result_table[result_table$taille_vitesse_moyenne == "Significatif", ])
  ages_vitesse_poids <- rownames(result_table[result_table$poids_vitesse_moyenne == "Significatif", ])
  
  # Retourner les âges où la différence est significative pour la taille et le poids
  return(list(ages_taille = ages_taille, ages_poids = ages_poids, ages_vitesse_taille = ages_vitesse_taille, ages_vitesse_poids = ages_vitesse_poids))
}

# Appel à la fonction pour extraire les âges où la différence est significative
significant_ages <- extract_significant_ages(result_combined)

# Afficher les âges où la différence est significative pour la taille
significant_ages$ages_taille

## [1] "16" "17" "18"

# Afficher les âges où la différence est significative pour le poids
significant_ages$ages_poids

## [1] "1"  "11" "16" "17" "18"

# Afficher les âges où la différence est significative pour la vitesse de croissance de la taille
significant_ages$ages_vitesse_taille

##  [1] "NA" "4"  "5"  "8"  "9"  "11" "13" "14" "15" "16" "17" "18"

# Afficher les âges où la différence est significative pour la vitesse de croissance du poids
significant_ages$ages_vitesse_poids

##  [1] "NA" "2"  "8"  "10" "11" "13" "14" "15" "16" "17" "18"

# Appel à la fonction pour extraire les âges où la différence est significative avec un seuil de 3%
significant_ages_3 <- extract_significant_ages(result_combined_3)

# Afficher les âges où la différence est significative pour la taille avec un seuil de 3%
significant_ages_3$ages_taille

## [1] "15" "16" "17" "18"

# Afficher les âges où la différence est significative pour le poids avec un seuil de 3%
significant_ages_3$ages_poids

## [1] "1"  "11" "12" "15" "16" "17" "18"

# Afficher les âges où la différence est significative pour la vitesse de croissance de la taille avec un seuil de 3%
significant_ages_3$ages_vitesse_taille

##  [1] "NA" "3"  "4"  "5"  "8"  "9"  "10" "11" "13" "14" "15" "16" "17" "18"

# Afficher les âges où la différence est significative pour la vitesse de croissance du poids avec un seuil de 3%
significant_ages_3$ages_vitesse_poids

##  [1] "NA" "2"  "3"  "8"  "10" "11" "13" "14" "15" "16" "17" "18"

# Graphique Significatif seuil 5%
result_long <- result_combined %>%
  mutate(Âge = 1:nrow(result_combined)) %>%
  gather(key = "Variable", value = "Signification", -Âge)

ggplot(result_long, aes(x = Variable, y = Âge, fill = Signification)) +
  geom_tile(color = "grey", size = 0.3) + # Ajouter les bordures des cellules
  scale_fill_manual(values = c("Non significatif" = "red", "Significatif" = "green")) +
  labs(title = "Différences Significatives entre Garçons et Filles (seuil 5%)",
       x = "Variable", y = "Âge", fill = "Signification") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1), 
    axis.text.y = element_text(size = 8), 
    panel.grid = element_blank(),        
    axis.title = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5)
  ) + 
  scale_y_continuous(breaks = seq(min(result_long$Âge), max(result_long$Âge), by = 1))

# Graphique Significatif seuil 3%
result_long_3 <- result_combined_3 %>%
  mutate(Âge = 1:nrow(result_combined_3)) %>%
  gather(key = "Variable", value = "Signification", -Âge)

ggplot(result_long_3, aes(x = Variable, y = Âge, fill = Signification)) +
  geom_tile(color = "grey", size = 0.3) + 
  scale_fill_manual(values = c("Non significatif" = "red", "Significatif" = "green")) +
  labs(title = "Différences Significatives entre Garçons et Filles (seuil 3%)",
       x = "Variable", y = "Âge", fill = "Signification") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.text.y = element_text(size = 8),
    panel.grid = element_blank(), 
    axis.title = element_text(size = 12), 
    plot.title = element_text(hjust = 0.5)
  ) +
  scale_y_continuous(breaks = seq(min(result_long$Âge), max(result_long$Âge), by = 1))

# Calculer la repartition à chaque age des imc dans les données pour chaque sexe 
imc_data <- bind_rows(data_list_interp$M, .id = "individu") %>%
  bind_rows() %>%
  mutate(sexe = "M") %>%
  bind_rows(bind_rows(data_list_interp$F, .id = "individu") %>%
              mutate(sexe = "F")) %>%
  group_by(age, sexe) %>%
  summarise(imc = imc_calculate(poids, taille)) %>%
  ungroup()

## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
##   always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## `summarise()` has grouped output by 'age', 'sexe'. You can override using the
## `.groups` argument.

# Créer un graphique a chaque age pour la répartition pour chaque sexe 
imc_plot <- ggplot(imc_data, aes(x = imc, fill = sexe)) +
  geom_density(alpha = 0.5) +
  facet_wrap(~age, scales = "free") +
  labs(title = "Répartition de l'IMC par âge et par sexe",
       x = "IMC", y = "Densité") +
  theme_minimal()

# Sauvegarder le graphique
ggsave("graphiques/imc_plot.jpeg", plot = imc_plot, width = 12, height = 6)

# Afficher le graphique
print(imc_plot)

Classification

Préparation des données pour la classification

# Regouper les données pour chaque sexe
data_M <- bind_rows(data_list_interp$M, .id = "individu") %>%
  mutate(sexe = "M")
data_F <- bind_rows(data_list_interp$F, .id = "individu") %>%
  mutate(sexe = "F")

# Ne garder que l'individu, l'age, le poids et la taille
data_M <- data_M[,c("individu","age", "poids", "taille", "sexe")]
data_F <- data_F[,c("individu","age", "poids", "taille", "sexe")]

# Rajouter une colonne pour le sexe (M ou F)
data_M$sexe <- "M"
data_F$sexe <- "F"

# Fusionner les données pour les garçons et les filles
data_combined_class <- bind_rows(data_M, data_F)

# Séparer les données par ages, ne garder que 16, 17 et 18 ans car se sont les ages où il y a une différence significative entre les sexes
data_16 <- data_combined_class[data_combined_class$age == 16, c("individu","poids", "taille", "sexe")]
data_17 <- data_combined_class[data_combined_class$age == 17, c("individu","poids", "taille", "sexe")]
data_18 <- data_combined_class[data_combined_class$age == 18, c("individu","poids", "taille", "sexe")]

data_16 <- as.data.frame(data_16)
row.names(data_16) <- data_16$individu
data_16<- data_16[,-1]
data_16$sexe <- as.factor(data_16$sexe)
data_17 <- as.data.frame(data_17)
row.names(data_17) <- data_17$individu
data_17<- data_17[,-1]
data_17$sexe <- as.factor(data_17$sexe)
data_18 <- as.data.frame(data_18)
row.names(data_18) <- data_18$individu
data_18<- data_18[,-1]
data_18$sexe <- as.factor(data_18$sexe)

# Standardize data
standardize_data <- function(data, colonne_X) {
  res_scale <- scale(data[, colonne_X])
  means <- attr(res_scale, "scaled:center")
  sds <- attr(res_scale, "scaled:scale")
  data[, colonne_X] <- res_scale
  return(list(
    data = data,
    means = means,
    sds = sds
  ))
}

# Standardiser les données
data_16 <- standardize_data(data_16, c("poids", "taille"))$data
data_17 <- standardize_data(data_17, c("poids", "taille"))$data
data_18 <- standardize_data(data_18, c("poids", "taille"))$data

Fonctions pour la classification

# KNN
knn_model <- function(data, colonne_X, colonne_z, k = 3) {
  X_train <- data[, colonne_X]
  z_train <- data[[colonne_z]]
  res_knn <- knn(X_train, X_train, z_train, k = k)
  return(res_knn)
}

knn_new <- function(data, new_data, colonne_X, colonne_z, k = 3) {
  X_train <- data[, colonne_X]
  z_train <- data[[colonne_z]]
  X_new <- new_data[, colonne_X]
  res_knn <- knn(X_train, X_new, z_train, k = k)
  return(res_knn)
}

# rpart
rpart_model <- function(data, colonne_X, colonne_z) {
  res_rpart <- rpart(as.formula(paste(colonne_z, "~ .")), data)
  fancyRpartPlot(res_rpart)
  res_predict <- predict(res_rpart, type = "class")
  return(res_predict)
}

rpart_new <- function(data, new_data, colonne_X, colonne_z) {
  res_rpart <- rpart(as.formula(paste(colonne_z, "~ .")), data)
  fancyRpartPlot(res_rpart)
  res_predict <- predict(res_rpart, newdata = new_data, type = "class")
  return(res_predict)
}

# glm
glm_model <- function(data, colonne_X, colonne_z) {
  res_glm <- glm(as.formula(paste(colonne_z, "~ .")), data, family = binomial(link = "logit"))
  hat_proba <- predict(res_glm, type = "response")
  z_predicted <- ifelse(hat_proba < 0.5, levels(data[[colonne_z]])[1], levels(data[[colonne_z]])[2])
  return(z_predicted)
}

glm_new <- function(data, new_data, colonne_X, colonne_z) {
  res_glm <- glm(as.formula(paste(colonne_z, "~ .")), data , family = binomial(link = "logit"))
  hat_proba <- predict(res_glm, newdata = new_data, type = "response")
  z_predicted <- ifelse(hat_proba < 0.5, levels(data[[colonne_z]])[1], levels(data[[colonne_z]])[2])
  return(z_predicted)
}

# lda
lda_model <- function(data, colonne_X, colonne_z) {
  res_lda <- lda(as.formula(paste(colonne_z, "~ .")), data)
  res_predict <- predict(res_lda)
  z_predicted <- res_predict$class
  return(z_predicted)
}

lda_new <- function(data, new_data, colonne_X, colonne_z) {
  res_lda <- lda(as.formula(paste(colonne_z, "~ .")), data)
  res_predict <- predict(res_lda, newdata = new_data)
  z_predicted <- res_predict$class
  return(z_predicted)
}

# qda
qda_model <- function(data, colonne_X, colonne_z) {
  res_qda <- qda(as.formula(paste(colonne_z, "~ .")), data)
  res_predict <- predict(res_qda)
  z_predicted <- res_predict$class
  return(z_predicted)
}

qda_new <- function(data, new_data, colonne_X, colonne_z) {
  res_qda <- qda(as.formula(paste(colonne_z, "~ .")), data)
  res_predict <- predict(res_qda, newdata = new_data)
  z_predicted <- res_predict$class
  return(z_predicted)
}

# Bayes
naiveBayes_model <- function(data, colonne_X, colonne_z) {
  res_naiveBayes <- naiveBayes(as.formula(paste(colonne_z, "~ .")), data)
  z_predicted <- predict(res_naiveBayes, newdata = data)
  return(z_predicted)
}

naiveBayes_new <- function(data, new_data, colonne_X, colonne_z) {
  res_naiveBayes <- naiveBayes(as.formula(paste(colonne_z, "~ .")), data)
  z_predicted <- predict(res_naiveBayes, newdata = new_data)
  return(z_predicted)
}


# Indicateurs de qualité
VP <- function(data_groupe, prediction_groupe, positif) {
  sum(data_groupe == positif & prediction_groupe == positif)
}

VN <- function(data_groupe, prediction_groupe, positif) {
  sum(data_groupe != positif & prediction_groupe != positif)
}

FP <- function(data_groupe, prediction_groupe, positif) {
  sum(data_groupe != positif & prediction_groupe == positif)
}

FN <- function(data_groupe, prediction_groupe, positif) {
  sum(data_groupe == positif & prediction_groupe != positif)
}

precision <- function(data_groupe, prediction_groupe, positif) {
  VP_count <- VP(data_groupe, prediction_groupe, positif)
  FP_count <- FP(data_groupe, prediction_groupe, positif)
  
  if (VP_count + FP_count == 0)
    return(NA) # éviter la division par zéro
  VP_count / (VP_count + FP_count)
}

sensibilite <- function(data_groupe, prediction_groupe, positif) {
  VP_count <- VP(data_groupe, prediction_groupe, positif)
  FN_count <- FN(data_groupe, prediction_groupe, positif)
  
  if (VP_count + FN_count == 0)
    return(NA) # éviter la division par zéro
  VP_count / (VP_count + FN_count)
}

specificite <- function(data_groupe, prediction_groupe, positif) {
  VN_count <- VN(data_groupe, prediction_groupe, positif)
  FP_count <- FP(data_groupe, prediction_groupe, positif)
  
  if (VN_count + FP_count == 0)
    return(NA) # éviter la division par zéro
  VN_count / (VN_count + FP_count)
}

# Exactitude
exactitude <- function(data, z_predicted) {
  mean(as.character(data[[colonne_z]]) == as.character(z_predicted))
}

# Matrice de confusion
matrice_confusion <- function(data, z_predicted) {
  table(data[[colonne_z]], z_predicted)
}

matrice_confusion_freq <- function(data, z_predicted) {
  prop.table(table(data[[colonne_z]], z_predicted))
}

# Entropie croisée
CrossEntropy <- function(data,
                         methode,
                         colonne_X,
                         colonne_z,
                         new_data) {
  if (methode == "naiveBayes") {
    res_naiveBayes <- naiveBayes(as.formula(paste(colonne_z, "~ .")), data)
    z_predicted <- predict(res_naiveBayes, newdata = new_data)
    res <- res_naiveBayes
  }
  if (methode == "lda") {
    res_lda <- lda(as.formula(paste(colonne_z, "~ .")), data)
    z_predicted <- predict(res_lda, newdata = new_data)
    res <- res_lda
  }
  if (methode == "qda") {
    res_qda <- qda(as.formula(paste(colonne_z, "~ .")), data)
    z_predicted <- predict(res_qda, newdata = new_data)
    res <- res_qda
  }
  if (methode == "glm") {
    res_glm <- glm(as.formula(paste(colonne_z, "~ .")), data, family = binomial(link = "logit"))
    z_predicted <- predict(res_glm, newdata = new_data)
    res <- res_glm
  }
  if (methode == "rpart") {
    res_rpart <- rpart(as.formula(paste(colonne_z, "~ .")), data)
    z_predicted <- predict(res_rpart, newdata = new_data)
    res <- res_rpart
  }
  if (methode == "knn") {
    res_knn <- knn(data, new_data, colonne_X, colonne_z, k = 3)
    z_predicted <- res_knn
    res <- NULL # KNN ne retourne pas un modèle
  }
  H <- as.matrix(one_hot(as.data.frame(data[[colonne_z]])))
  P <- as.matrix(predict(res, newdata = new_data, type = "raw"))
  EC <- -sum(H * log(P + 1e-10)) # Ajout d'une petite constante pour éviter log(0)
  return(EC)
}

exactitude_rpart <- function(data_train,
                             data_test,
                             colonnes_X,
                             colonne_z) {
  # Définir la formule d'ajustement
  formule <- as.formula(paste(colonne_z, "~", paste0(colonnes_X, collapse = "+")))
  # Ajuster le classifieur
  res_rpart <- rpart(formule, data = data_train)
  # Extraire les prédictions
  res_predict <- predict(res_rpart, newdata = data_test)
  z_new_predicted <-
    apply(res_predict, 1, function(v)
      as.factor(names(which.max(v))))
  # Calculer l'indicateur de qualité
  exactitude <- mean(as.character(data_test[, colonne_z]) == as.character(z_new_predicted))
  # Retourner le résultat
  return(exactitude)
}

exactitude_knn <- function(data_train,
                           data_test,
                           colonnes_X,
                           colonne_z) {
  # Ajuster le classifieur
  res_knn <- knn(data_train[, colonnes_X], data_test[, colonnes_X], data_train[, colonne_z], k =
                   3)
  # Calculer l'indicateur de qualité
  exactitude <- mean(as.character(data_test[, colonne_z]) == as.character(res_knn))
  # Retourner le résultat
  return(exactitude)
}

exactitude_lda <- function(data_train,
                           data_test,
                           colonnes_X,
                           colonne_z) {
  # Définir la formule d'ajustement
  formule <- as.formula(paste(colonne_z, "~", paste0(colonnes_X, collapse = "+")))
  # Ajuster le classifieur
  res_lda <- lda(formule, data = data_train)
  # Extraire les prédictions
  res_predict <- predict(res_lda, newdata = data_test)
  z_new_predicted <- res_predict$class
  # Calculer l'indicateur de qualité
  exactitude <- mean(as.character(data_test[, colonne_z]) == as.character(z_new_predicted))
  # Retourner le résultat
  return(exactitude)
}

exactitude_qda <- function(data_train,
                           data_test,
                           colonnes_X,
                           colonne_z) {
  # Définir la formule d'ajustement
  formule <- as.formula(paste(colonne_z, "~", paste0(colonnes_X, collapse = "+")))
  # Ajuster le classifieur
  res_qda <- qda(formule, data = data_train)
  # Extraire les prédictions
  res_predict <- predict(res_qda, newdata = data_test)
  z_new_predicted <- res_predict$class
  # Calculer l'indicateur de qualité
  exactitude <- mean(as.character(data_test[, colonne_z]) == as.character(z_new_predicted))
  # Retourner le résultat
  return(exactitude)
}

exactitude_glm <- function(data_train,
                           data_test,
                           colonnes_X,
                           colonne_z) {
  # Package nécessaire
  library(MASS)
  # Définir la formule d'ajustement
  formule <- as.formula(paste(colonne_z, "~", paste0(colonnes_X, collapse = "+")))
  # Ajuster le classifieur
  res_glm <- glm(formule, data = data_train, family = binomial(link = "logit"))
  # Extraire les prédictions
  res_predict <- predict(res_glm, newdata = data_test, type = "response")
  z_new_predicted <- ifelse(res_predict < 0.5,
                            levels(data_train[, colonne_z])[1],
                            levels(data_train[, colonne_z])[2])
  # Calculer l'indicateur de qualité
  exactitude <- mean(as.character(data_test[, colonne_z]) == as.character(z_new_predicted))
  # Retourner le résultat
  return(exactitude)
}

exactitude_naiveBayes <- function(data_train,
                                  data_test,
                                  colonnes_X,
                                  colonne_z) {
  # Ajuster le classifieur
  res_naiveBayes <- naiveBayes(as.formula(paste(
    colonne_z, "~", paste0(colonnes_X, collapse = "+")
  )), data = data_train)
  # Extraire les prédictions
  z_new_predicted <- predict(res_naiveBayes, newdata = data_test)
  # Calculer l'indicateur de qualité
  exactitude <- mean(as.character(data_test[, colonne_z]) == as.character(z_new_predicted))
  # Retourner le résultat
  return(exactitude)
}

# Fonction de validation croisée simple
simple_VC <- function(data, colonne_X, colonne_z, methode) {
  # Taille de l'échantillon
  n <- nrow(data)
  # Taille de l'échantillon d'apprentissage
  n_train <- floor(n * 0.8)
  # Déterminer de manière aléatoire les données de l'échantillon d'apprentissage
  index_train <- sample(n, n_train)
  # Séparation de l'échantillon
  data_train <- data[index_train, ]
  data_test <- data[-index_train, ]
  if (methode == "naiveBayes") {
    exactitude_cv <- exactitude_naiveBayes(data_train, data_test, colonne_X, colonne_z)
  }
  if (methode == "lda") {
    exactitude_cv <- exactitude_lda(data_train, data_test, colonne_X, colonne_z)
  }
  if (methode == "qda") {
    exactitude_cv <- exactitude_qda(data_train, data_test, colonne_X, colonne_z)
  }
  if (methode == "glm") {
    exactitude_cv <- exactitude_glm(data_train, data_test, colonne_X, colonne_z)
  }
  if (methode == "rpart") {
    exactitude_cv <- exactitude_rpart(data_train, data_test, colonne_X, colonne_z)
  }
  if (methode == "knn") {
    exactitude_cv <- exactitude_knn(data_train, data_test, colonne_X, colonne_z)
  }
  return(exactitude_cv)
}

# VC loo
loo_VC <- function(data, colonne_X, colonne_z, methode) {
  # Taille de l'échantillon
  n <- nrow(data)
  # Définir un objet qui contiendra les résultats intermédiaires
  res_cv_loo <- rep(NA, n)
  # Boucler sur les données à isoler successivement
  for (i in 1:n) {
    # Isoler la ième données dans l'échantillon de test
    data_train_loo <- data[-i, ]
    data_test_loo <- data[i, ]
    if (methode == "naiveBayes") {
      res_cv_loo[i] <- exactitude_naiveBayes(data_train_loo, data_test_loo, colonne_X, colonne_z)
    }
    if (methode == "lda") {
      res_cv_loo[i] <- exactitude_lda(data_train_loo, data_test_loo, colonne_X, colonne_z)
    }
    if (methode == "qda") {
      res_cv_loo[i] <- exactitude_qda(data_train_loo, data_test_loo, colonne_X, colonne_z)
    }
    if (methode == "glm") {
      res_cv_loo[i] <- exactitude_glm(data_train_loo, data_test_loo, colonne_X, colonne_z)
    }
    if (methode == "rpart") {
      res_cv_loo[i] <- exactitude_rpart(data_train_loo, data_test_loo, colonne_X, colonne_z)
    }
    if (methode == "knn") {
      res_cv_loo[i] <- exactitude_knn(data_train_loo, data_test_loo, colonne_X, colonne_z)
    }
  }
  exactitude_cv_loo <- mean(res_cv_loo)
  return(exactitude_cv_loo)
}


# CV kfold
kfold_VC <- function(data, colonne_X, colonne_z, k, methode) {
  # Taille de l'échantillon
  n <- nrow(data)
  # Définir les compartiments de données
  folds <- sample(rep(1:k, each = n / k))
  # Définir un objet qui contiendra les résultats intermédiaires
  res_kfold <- rep(NA, k)
  # Boucler sur les dossiers de données à isoler successivement
  for (fold in 1:k) {
    # Isoler le dossier de données dans l'échantillon de test
    index_test <- which(folds == fold)
    data_train_kfold <- data[-index_test, ]
    data_test_kfold <- data[index_test, ]
    if (methode == "naiveBayes") {
      res_kfold[fold] <- exactitude_naiveBayes(data_train_kfold, data_test_kfold, colonne_X, colonne_z)
    }
    if (methode == "lda") {
      res_kfold[fold] <- exactitude_lda(data_train_kfold, data_test_kfold, colonne_X, colonne_z)
    }
    if (methode == "qda") {
      res_kfold[fold] <- exactitude_qda(data_train_kfold, data_test_kfold, colonne_X, colonne_z)
    }
    if (methode == "glm") {
      res_kfold[fold] <- exactitude_glm(data_train_kfold, data_test_kfold, colonne_X, colonne_z)
    }
    if (methode == "rpart") {
      res_kfold[fold] <- exactitude_rpart(data_train_kfold, data_test_kfold, colonne_X, colonne_z)
    }
    if (methode == "knn") {
      res_kfold[fold] <- exactitude_knn(data_train_kfold, data_test_kfold, colonne_X, colonne_z)
    }
  }
  exactitude_cv_kfold <- mean(res_kfold)
  return(exactitude_cv_kfold)
}

# CV kfold répété
repeated_kfold_VC <- function(data,
                              colonne_X,
                              colonne_z,
                              k,
                              N_rep,
                              methode) {
  # Définir un objet qui contiendra des résultats intermédiaires
  res_kfold <- rep(NA, N_rep)
  # Boucler sur le nombre de répétitions
  for (i in 1:N_rep) {
    # Définir les compartiments de données
    folds <- sample(rep(1:k, each = n / k))
    # Définir un objet qui contiendra des résultats intermédiaires
    res_kfold_tmp <- rep(NA, k)
    # Boucler sur les dossiers de données à isoler successivement
    for (fold in 1:k) {
      # Isoler le dossier de données dans l'échantillon de test
      index_test <- which(folds == fold)
      data_train_kfold <- data[-index_test, ]
      data_test_kfold <- data[index_test, ]
      if (methode == "naiveBayes") {
        res_kfold_tmp[fold] <- exactitude_naiveBayes(data_train_kfold,
                                                     data_test_kfold,
                                                     colonne_X,
                                                     colonne_z)
      }
      if (methode == "lda") {
        res_kfold_tmp[fold] <- exactitude_lda(data_train_kfold,
                                              data_test_kfold,
                                              colonne_X,
                                              colonne_z)
      }
      if (methode == "qda") {
        res_kfold_tmp[fold] <- exactitude_qda(data_train_kfold,
                                              data_test_kfold,
                                              colonne_X,
                                              colonne_z)
      }
      if (methode == "glm") {
        res_kfold_tmp[fold] <- exactitude_glm(data_train_kfold,
                                              data_test_kfold,
                                              colonne_X,
                                              colonne_z)
      }
      if (methode == "rpart") {
        res_kfold_tmp[fold] <- exactitude_rpart(data_train_kfold,
                                                data_test_kfold,
                                                colonne_X,
                                                colonne_z)
      }
      if (methode == "knn") {
        res_kfold_tmp[fold] <- exactitude_knn(data_train_kfold,
                                              data_test_kfold,
                                              colonne_X,
                                              colonne_z)
      }
    }
    # Calculer le résultat pour cette répétition
    res_kfold[i] <- mean(res_kfold_tmp)
  }
  exactitude_cv_kfold <- mean(res_kfold)
  return(exactitude_cv_kfold)
}

# Bootstrap
prediction_rpart <- function(data_train,
                             new_data,
                             colonnes_X,
                             colonne_z) {
  # Définir la formule d'ajustement
  formule <- as.formula(paste(colonne_z, "~", paste0(colonnes_X, collapse = "+")))
  # Ajuster le classifieur
  res_rpart <- rpart(formule, data = data_train)
  # Calculer les prédictions
  res_predict <- predict(res_rpart, newdata = new_data)
  res <- apply(res_predict, 1, function(v)
    as.factor(names(which.max(v))))
  # Formater les prédictions
  res <- as.character(res)
  # Retourner le résultat
  return(res)
}

prediction_knn <- function(data_train,
                           new_data,
                           colonnes_X,
                           colonne_z) {
  # Ajuster le classifieur
  res_knn <- knn(data_train[, colonnes_X], new_data[, colonnes_X], data_train[, colonne_z], k =
                   3)
  # Formater les prédictions
  res <- as.character(res_knn)
  # Retourner le résultat
  return(res)
}

prediction_lda <- function(data_train,
                           new_data,
                           colonnes_X,
                           colonne_z) {
  # Définir la formule d'ajustement
  formule <- as.formula(paste(colonne_z, "~", paste0(colonnes_X, collapse = "+")))
  # Ajuster le classifieur
  res_lda <- lda(formule, data = data_train)
  # Calculer les prédictions
  res_predict <- predict(res_lda, newdata = new_data)
  res <- res_predict$class
  # Formater les prédictions
  res <- as.character(res)
  # Retourner le résultat
  return(res)
}

prediction_qda <- function(data_train,
                           new_data,
                           colonnes_X,
                           colonne_z) {
  # Définir la formule d'ajustement
  formule <- as.formula(paste(colonne_z, "~", paste0(colonnes_X, collapse = "+")))
  # Ajuster le classifieur
  res_qda <- qda(formule, data = data_train)
  # Calculer les prédictions
  res_predict <- predict(res_qda, newdata = new_data)
  res <- res_predict$class
  # Formater les prédictions
  res <- as.character(res)
  # Retourner le résultat
  return(res)
}

prediction_glm <- function(data_train,
                           new_data,
                           colonnes_X,
                           colonne_z) {
  # Définir la formule d'ajustement
  formule <- as.formula(paste(colonne_z, "~", paste0(colonnes_X, collapse = "+")))
  # Ajuster le classifieur
  res_glm <- glm(formule, data = data_train, family = binomial(link = "logit"))
  # Calculer les prédictions
  res_predict <- predict(res_glm, newdata = new_data, type = "response")
  res <- ifelse(res_predict < 0.5,
                levels(data_train[, colonne_z])[1],
                levels(data_train[, colonne_z])[2])
  # Formater les prédictions
  res <- as.character(res)
  # Retourner le résultat
  return(res)
}

prediction_naiveBayes <- function(data_train,
                                  new_data,
                                  colonnes_X,
                                  colonne_z) {
  # Ajuster le classifieur
  res_naiveBayes <- naiveBayes(as.formula(paste(
    colonne_z, "~", paste0(colonnes_X, collapse = "+")
  )), data = data_train)
  # Calculer les prédictions
  res <- predict(res_naiveBayes, newdata = new_data)
  # Formater les prédictions
  res <- as.character(res)
  # Retourner le résultat
  return(res)
}


bootstrap <- function(data,
                      X_new,
                      colonnes_X,
                      colonne_z,
                      methode ,
                      b_rep) {
  # Taille de l'échantillon
  n <- nrow(data)
  # Taille souhaitée pour les échantillons bootstrap
  nb <- n
  # Définir un objet qui contiendra les résultats intermédiaires
  res_tmp <- matrix("NA", ncol = b_rep, nrow = nrow(X_new))
  # Répéter le bootstrap
  for (b in 1:b_rep) {
    # Rééchantillonner (avec remise)
    data_bootstrap <- data[sample(n, nb, replace = T), ]
    # Calculer le résultat sur l'échantillon bootstrap
    if (methode == "naiveBayes") {
      res_tmp[, b] <- prediction_naiveBayes(data_bootstrap, X_new, colonnes_X, colonne_z)
    }
    if (methode == "lda") {
      res_tmp[, b] <- prediction_lda(data_bootstrap, X_new, colonnes_X, colonne_z)
    }
    if (methode == "qda") {
      res_tmp[, b] <- prediction_qda(data_bootstrap, X_new, colonnes_X, colonne_z)
    }
    if (methode == "glm") {
      res_tmp[, b] <- prediction_glm(data_bootstrap, X_new, colonnes_X, colonne_z)
    }
    if (methode == "rpart") {
      res_tmp[, b] <- prediction_rpart(data_bootstrap, X_new, colonnes_X, colonne_z)
    }
    if (methode == "knn") {
      res_tmp[, b] <- prediction_knn(data_bootstrap, X_new, colonnes_X, colonne_z)
    }
  }
  # Calculer la fréquence des résultats obtenus
  res_bootstrap <- NULL
  res_tmp <- apply(res_tmp, 2, as.character)
  for (i in 1:nrow(res_tmp)) {
    res_tmp_i <- factor(res_tmp[i, ], levels = levels(data[, colonne_z]))
    res_bootstrap <- rbind(res_bootstrap, prop.table(table(res_tmp_i)) * 100)
  }
  # Retourner le résultat
  return(res_bootstrap)
}

Application de la classification

select_best_method <- function(data, colonne_X, colonne_z, k = 5, methods = c("naiveBayes", "lda", "qda", "glm", "rpart", "knn")) {
  # Initialiser une liste pour stocker les résultats
  results <- data.frame(method = character(), accuracy = numeric(), stringsAsFactors = FALSE)
  
  # Boucler sur chaque méthode
  for (method in methods) {
    # Effectuer une validation croisée k-fold pour chaque méthode
    accuracy <- kfold_VC(data, colonne_X, colonne_z, k, method)
    
    # Ajouter les résultats au tableau
    results <- rbind(results, data.frame(method = method, accuracy = accuracy))
  }
  
  # Identifier la méthode avec la meilleure exactitude
  best_method <- results[which.max(results$accuracy), ]
  
  # Retourner les résultats et la meilleure méthode
  return(list(best_method = best_method, all_results = results))
}

# Utilisation de la fonction pour data_16, data_17, data_18
resultats_16 <- select_best_method(data_16, colonne_X = c("taille", "poids"), colonne_z = "sexe", k = 5)
resultats_17 <- select_best_method(data_17, colonne_X = c("taille", "poids"), colonne_z = "sexe", k = 5)
resultats_18 <- select_best_method(data_18, colonne_X = c("taille", "poids"), colonne_z = "sexe", k = 5)

# Faire un data frame pour les résultats
resultats <- data.frame(method = c("naiveBayes", "lda", "qda", "glm", "rpart", "knn"), stringsAsFactors = FALSE)
resultats$accuracy16 <- resultats_16$all_results$accuracy
resultats$accuracy17 <- resultats_17$all_results$accuracy
resultats$accuracy18 <- resultats_18$all_results$accuracy
resultats$accuracy_mean <- rowMeans(resultats[, c("accuracy16", "accuracy17", "accuracy18")])
resultats <- resultats[, c("method", "accuracy_mean", "accuracy16", "accuracy17", "accuracy18")]
resultats <- resultats[order(-resultats$accuracy_mean), ]

# Afficher les résultats
resultats

##       method accuracy_mean accuracy16 accuracy17 accuracy18
## 4        glm     0.8925926  0.8991285  0.9153595  0.8632898
## 3        qda     0.8899782  0.8969499  0.9128540  0.8601307
## 2        lda     0.8870733  0.8905229  0.9113290  0.8593682
## 5      rpart     0.8822803  0.8838780  0.9069717  0.8559913
## 6        knn     0.8786129  0.8839869  0.9080610  0.8437908
## 1 naiveBayes     0.8111474  0.7929194  0.8346405  0.8058824

prediction_16 <- data_16
prediction_17 <- data_17
prediction_18 <- data_18
# GLM
# 16 ans
prediction_16$res_glm <- glm_model(data_16, colonne_X = c("taille", "poids"), colonne_z = "sexe")
# 17 ans
prediction_17$res_glm <- glm_model(data_17, colonne_X = c("taille", "poids"), colonne_z = "sexe")
# 18 ans
prediction_18$res_glm <- glm_model(data_18, colonne_X = c("taille", "poids"), colonne_z = "sexe")

# LDA
# 16 ans
prediction_16$res_lda <- lda_model(data_16, colonne_X = c("taille", "poids"), colonne_z = "sexe")
# 17 ans
prediction_17$res_lda <- lda_model(data_17, colonne_X = c("taille", "poids"), colonne_z = "sexe")
# 18 ans
prediction_18$res_lda <- lda_model(data_18, colonne_X = c("taille", "poids"), colonne_z = "sexe")

# Rpart
# 16 ans
prediction_16$res_rpart <- rpart_model(data_16, colonne_X = c("taille", "poids"), colonne_z = "sexe")

# 17 ans
prediction_17$res_rpart <- rpart_model(data_17, colonne_X = c("taille", "poids"), colonne_z = "sexe")

# 18 ans
prediction_18$res_rpart <- rpart_model(data_18, colonne_X = c("taille", "poids"), colonne_z = "sexe")

# KNN
# 16 ans
prediction_16$res_knn <- knn_model(data_16, colonne_X = c("taille", "poids"), colonne_z = "sexe")
# 17 ans
prediction_17$res_knn <- knn_model(data_17, colonne_X = c("taille", "poids"), colonne_z = "sexe")
# 18 ans
prediction_18$res_knn <- knn_model(data_18, colonne_X = c("taille", "poids"), colonne_z = "sexe")


# Comptabiliser la prediction la plus fréquente pour chaque ligne
prediction_16$prediction <- apply(prediction_16[, c("res_glm", "res_lda", "res_rpart", "res_knn")], 1, function(x) {
  names(sort(table(x), decreasing = TRUE)[1])
})

prediction_17$prediction <- apply(prediction_17[, c("res_glm", "res_lda", "res_rpart", "res_knn")], 1, function(x) {
  names(sort(table(x), decreasing = TRUE)[1])
})

prediction_18$prediction <- apply(prediction_18[, c("res_glm", "res_lda", "res_rpart", "res_knn")], 1, function(x) {
  names(sort(table(x), decreasing = TRUE)[1])
})

# Extraire les lignes où la prédiction est incorrecte
prediction_16_incorrect <- prediction_16[prediction_16$sexe != prediction_16$prediction, ]
prediction_17_incorrect <- prediction_17[prediction_17$sexe != prediction_17$prediction, ]
prediction_18_incorrect <- prediction_18[prediction_18$sexe != prediction_18$prediction, ]

# Remmettre la colonne individu
prediction_16_incorrect$individu <- rownames(prediction_16_incorrect)
prediction_17_incorrect$individu <- rownames(prediction_17_incorrect)
prediction_18_incorrect$individu <- rownames(prediction_18_incorrect)

# Faire un data frame pour afficher les individus avec des prédictions incorrectes en ne gardant que l'individu, le sexe et la prediction
prediction_16_incorrect <- prediction_16_incorrect[, c("individu","sexe", "prediction")]
prediction_17_incorrect <- prediction_17_incorrect[, c("individu","sexe", "prediction")]
prediction_18_incorrect <- prediction_18_incorrect[, c("individu","sexe", "prediction")]

# Ne garder que ceux qui sont dans les trois
prediction_incorrect <- merge(prediction_16_incorrect, prediction_17_incorrect, by = "individu")[,(1:3)]

prediction_incorrect <- merge(prediction_incorrect, prediction_18_incorrect, by = "individu")[,(1:3)]

colnames(prediction_incorrect) <- c("individu", "sexe", "prédiction")

# Séparer les individus par sexe
prediction_incorrect_f <- prediction_incorrect[prediction_incorrect$sexe == "F",]
prediction_incorrect_m <- prediction_incorrect[prediction_incorrect$sexe == "M",]

# Afficher les individus avec des prédictions incorrectes
prediction_incorrect_f

##     individu sexe prédiction
## 4   ind_1031    F          M
## 6   ind_1083    F          M
## 16  ind_1287    F          M
## 18  ind_1293    F          M
## 33  ind_1520    F          M
## 37  ind_1594    F          M
## 48  ind_1737    F          M
## 57   ind_185    F          M
## 66  ind_1959    F          M
## 68  ind_1969    F          M
## 70  ind_1999    F          M
## 71  ind_2027    F          M
## 74  ind_2123    F          M
## 77  ind_2207    F          M
## 97  ind_2486    F          M
## 98  ind_2519    F          M
## 103 ind_2612    F          M
## 104 ind_2622    F          M
## 120 ind_2981    F          M
## 128 ind_3115    F          M
## 130 ind_3138    F          M
## 131 ind_3147    F          M
## 132 ind_3191    F          M
## 135 ind_3248    F          M
## 136 ind_3250    F          M
## 138 ind_3279    F          M
## 139 ind_3319    F          M
## 142  ind_334    F          M
## 145 ind_3384    F          M
## 156 ind_3557    F          M
## 157 ind_3560    F          M
## 158 ind_3605    F          M
## 160 ind_3621    F          M
## 163 ind_3631    F          M
## 166 ind_3683    F          M
## 175 ind_3845    F          M
## 181 ind_3970    F          M
## 183 ind_4018    F          M
## 185 ind_4042    F          M
## 189 ind_4181    F          M
## 198 ind_4281    F          M
## 200 ind_4293    F          M
## 217 ind_4450    F          M
## 223 ind_4534    F          M
## 225 ind_4591    F          M
## 230 ind_4668    F          M
## 231 ind_4722    F          M
## 232 ind_4776    F          M
## 244 ind_4933    F          M
## 247  ind_497    F          M
## 248 ind_4996    F          M
## 250 ind_5007    F          M
## 253 ind_5025    F          M
## 255 ind_5091    F          M
## 256  ind_510    F          M
## 257 ind_5116    F          M
## 259 ind_5130    F          M
## 264 ind_5205    F          M
## 265 ind_5242    F          M
## 279 ind_5504    F          M
## 280 ind_5508    F          M
## 281 ind_5528    F          M
## 284 ind_5581    F          M
## 290 ind_5674    F          M
## 293 ind_5772    F          M
## 294 ind_5830    F          M
## 295 ind_5841    F          M
## 300 ind_5956    F          M
## 303 ind_5997    F          M
## 304 ind_5998    F          M
## 305 ind_6004    F          M
## 307 ind_6036    F          M
## 308 ind_6087    F          M
## 309 ind_6097    F          M
## 314   ind_62    F          M
## 320  ind_626    F          M
## 325 ind_6374    F          M
## 328 ind_6395    F          M
## 332 ind_6443    F          M
## 334 ind_6454    F          M
## 339 ind_6524    F          M
## 340 ind_6547    F          M
## 342 ind_6575    F          M
## 344 ind_6677    F          M
## 345 ind_6692    F          M
## 350 ind_6877    F          M
## 353 ind_6975    F          M
## 357 ind_6992    F          M
## 362 ind_7087    F          M
## 363 ind_7102    F          M
## 366 ind_7190    F          M
## 371 ind_7298    F          M
## 374 ind_7346    F          M
## 377 ind_7434    F          M
## 378 ind_7444    F          M
## 382 ind_7467    F          M
## 385 ind_7489    F          M
## 405 ind_7983    F          M
## 407   ind_80    F          M
## 412 ind_8109    F          M
## 413 ind_8112    F          M
## 417  ind_813    F          M
## 423 ind_8319    F          M
## 427 ind_8382    F          M
## 428 ind_8386    F          M
## 430 ind_8403    F          M
## 432 ind_8472    F          M
## 435 ind_8552    F          M
## 441 ind_8682    F          M
## 449 ind_8799    F          M
## 450 ind_8830    F          M
## 451 ind_8835    F          M
## 452 ind_8848    F          M
## 457 ind_8915    F          M
## 460 ind_8928    F          M
## 461 ind_8967    F          M
## 462  ind_898    F          M
## 465 ind_9051    F          M
## 466 ind_9106    F          M
## 468 ind_9162    F          M
## 470 ind_9165    F          M

prediction_incorrect_m

##     individu sexe prédiction
## 1     ind_10    M          F
## 2   ind_1023    M          F
## 3   ind_1024    M          F
## 5   ind_1038    M          F
## 7   ind_1085    M          F
## 8   ind_1086    M          F
## 9   ind_1092    M          F
## 10  ind_1094    M          F
## 11  ind_1145    M          F
## 12  ind_1151    M          F
## 13  ind_1161    M          F
## 14  ind_1197    M          F
## 15  ind_1228    M          F
## 17  ind_1291    M          F
## 19  ind_1306    M          F
## 20  ind_1314    M          F
## 21   ind_133    M          F
## 22  ind_1331    M          F
## 23  ind_1335    M          F
## 24  ind_1386    M          F
## 25  ind_1405    M          F
## 26  ind_1421    M          F
## 27  ind_1434    M          F
## 28  ind_1445    M          F
## 29  ind_1479    M          F
## 30  ind_1481    M          F
## 31  ind_1507    M          F
## 32  ind_1511    M          F
## 34  ind_1551    M          F
## 35  ind_1566    M          F
## 36  ind_1576    M          F
## 38  ind_1606    M          F
## 39  ind_1609    M          F
## 40  ind_1621    M          F
## 41  ind_1638    M          F
## 42  ind_1657    M          F
## 43  ind_1660    M          F
## 44  ind_1673    M          F
## 45  ind_1680    M          F
## 46  ind_1697    M          F
## 47   ind_170    M          F
## 49  ind_1748    M          F
## 50  ind_1763    M          F
## 51  ind_1781    M          F
## 52  ind_1806    M          F
## 53   ind_181    M          F
## 54  ind_1815    M          F
## 55  ind_1847    M          F
## 56  ind_1848    M          F
## 58  ind_1896    M          F
## 59  ind_1899    M          F
## 60   ind_190    M          F
## 61  ind_1906    M          F
## 62  ind_1918    M          F
## 63  ind_1923    M          F
## 64  ind_1939    M          F
## 65   ind_194    M          F
## 67  ind_1965    M          F
## 69  ind_1995    M          F
## 72  ind_2090    M          F
## 73   ind_212    M          F
## 75  ind_2134    M          F
## 76  ind_2200    M          F
## 78  ind_2216    M          F
## 79   ind_222    M          F
## 80  ind_2235    M          F
## 81   ind_225    M          F
## 82  ind_2284    M          F
## 83  ind_2293    M          F
## 84  ind_2328    M          F
## 85  ind_2330    M          F
## 86  ind_2346    M          F
## 87  ind_2350    M          F
## 88  ind_2393    M          F
## 89   ind_240    M          F
## 90  ind_2413    M          F
## 91  ind_2414    M          F
## 92  ind_2418    M          F
## 93  ind_2419    M          F
## 94  ind_2420    M          F
## 95  ind_2425    M          F
## 96  ind_2469    M          F
## 99  ind_2524    M          F
## 100 ind_2541    M          F
## 101 ind_2594    M          F
## 102 ind_2600    M          F
## 105 ind_2624    M          F
## 106  ind_263    M          F
## 107 ind_2648    M          F
## 108 ind_2669    M          F
## 109 ind_2682    M          F
## 110 ind_2714    M          F
## 111 ind_2730    M          F
## 112  ind_274    M          F
## 113 ind_2740    M          F
## 114 ind_2743    M          F
## 115 ind_2784    M          F
## 116 ind_2829    M          F
## 117 ind_2871    M          F
## 118 ind_2916    M          F
## 119  ind_296    M          F
## 121 ind_2993    M          F
## 122 ind_3023    M          F
## 123  ind_305    M          F
## 124 ind_3061    M          F
## 125  ind_309    M          F
## 126 ind_3099    M          F
## 127  ind_311    M          F
## 129 ind_3123    M          F
## 133 ind_3218    M          F
## 134  ind_324    M          F
## 137 ind_3253    M          F
## 140  ind_332    M          F
## 141 ind_3331    M          F
## 143 ind_3361    M          F
## 144 ind_3364    M          F
## 146   ind_34    M          F
## 147 ind_3401    M          F
## 148 ind_3436    M          F
## 149 ind_3443    M          F
## 150 ind_3502    M          F
## 151 ind_3506    M          F
## 152 ind_3518    M          F
## 153 ind_3525    M          F
## 154 ind_3542    M          F
## 155 ind_3552    M          F
## 159 ind_3606    M          F
## 161 ind_3624    M          F
## 162 ind_3625    M          F
## 164 ind_3647    M          F
## 165 ind_3668    M          F
## 167 ind_3687    M          F
## 168 ind_3706    M          F
## 169 ind_3742    M          F
## 170  ind_375    M          F
## 171 ind_3755    M          F
## 172 ind_3785    M          F
## 173 ind_3808    M          F
## 174  ind_382    M          F
## 176  ind_386    M          F
## 177   ind_39    M          F
## 178 ind_3902    M          F
## 179 ind_3945    M          F
## 180  ind_395    M          F
## 182 ind_3987    M          F
## 184 ind_4026    M          F
## 186 ind_4055    M          F
## 187 ind_4084    M          F
## 188 ind_4136    M          F
## 190 ind_4189    M          F
## 191 ind_4196    M          F
## 192  ind_421    M          F
## 193 ind_4212    M          F
## 194 ind_4217    M          F
## 195 ind_4220    M          F
## 196 ind_4257    M          F
## 197 ind_4280    M          F
## 199 ind_4292    M          F
## 201 ind_4295    M          F
## 202 ind_4297    M          F
## 203 ind_4307    M          F
## 204 ind_4309    M          F
## 205 ind_4342    M          F
## 206 ind_4343    M          F
## 207 ind_4357    M          F
## 208 ind_4377    M          F
## 209 ind_4379    M          F
## 210 ind_4393    M          F
## 211 ind_4394    M          F
## 212 ind_4395    M          F
## 213 ind_4398    M          F
## 214   ind_44    M          F
## 215  ind_443    M          F
## 216 ind_4443    M          F
## 218 ind_4451    M          F
## 219 ind_4456    M          F
## 220 ind_4516    M          F
## 221  ind_452    M          F
## 222 ind_4529    M          F
## 224 ind_4554    M          F
## 226 ind_4600    M          F
## 227 ind_4631    M          F
## 228 ind_4644    M          F
## 229 ind_4666    M          F
## 233 ind_4787    M          F
## 234 ind_4788    M          F
## 235 ind_4803    M          F
## 236 ind_4810    M          F
## 237 ind_4871    M          F
## 238 ind_4872    M          F
## 239 ind_4875    M          F
## 240 ind_4893    M          F
## 241 ind_4909    M          F
## 242 ind_4917    M          F
## 243 ind_4924    M          F
## 245 ind_4949    M          F
## 246 ind_4959    M          F
## 249 ind_4997    M          F
## 251 ind_5012    M          F
## 252 ind_5019    M          F
## 254 ind_5053    M          F
## 258 ind_5128    M          F
## 260  ind_519    M          F
## 261 ind_5192    M          F
## 262 ind_5194    M          F
## 263 ind_5203    M          F
## 266 ind_5251    M          F
## 267 ind_5262    M          F
## 268 ind_5273    M          F
## 269 ind_5283    M          F
## 270 ind_5292    M          F
## 271 ind_5305    M          F
## 272 ind_5349    M          F
## 273  ind_543    M          F
## 274 ind_5450    M          F
## 275 ind_5453    M          F
## 276 ind_5462    M          F
## 277 ind_5470    M          F
## 278 ind_5476    M          F
## 282 ind_5548    M          F
## 283  ind_555    M          F
## 285 ind_5614    M          F
## 286 ind_5644    M          F
## 287  ind_565    M          F
## 288 ind_5657    M          F
## 289  ind_566    M          F
## 291 ind_5676    M          F
## 292 ind_5761    M          F
## 296 ind_5849    M          F
## 297 ind_5896    M          F
## 298 ind_5941    M          F
## 299 ind_5943    M          F
## 301 ind_5966    M          F
## 302 ind_5973    M          F
## 306 ind_6026    M          F
## 310 ind_6154    M          F
## 311 ind_6155    M          F
## 312 ind_6158    M          F
## 313  ind_617    M          F
## 315 ind_6216    M          F
## 316 ind_6239    M          F
## 317  ind_624    M          F
## 318 ind_6242    M          F
## 319 ind_6243    M          F
## 321 ind_6267    M          F
## 322 ind_6292    M          F
## 323 ind_6321    M          F
## 324  ind_636    M          F
## 326 ind_6385    M          F
## 327  ind_639    M          F
## 329 ind_6417    M          F
## 330 ind_6422    M          F
## 331 ind_6437    M          F
## 333  ind_645    M          F
## 335 ind_6462    M          F
## 336 ind_6469    M          F
## 337   ind_65    M          F
## 338 ind_6523    M          F
## 341 ind_6551    M          F
## 343 ind_6618    M          F
## 346 ind_6764    M          F
## 347 ind_6804    M          F
## 348 ind_6808    M          F
## 349 ind_6838    M          F
## 351 ind_6927    M          F
## 352 ind_6955    M          F
## 354 ind_6979    M          F
## 355 ind_6982    M          F
## 356 ind_6987    M          F
## 358 ind_6993    M          F
## 359 ind_7049    M          F
## 360 ind_7054    M          F
## 361  ind_707    M          F
## 364 ind_7159    M          F
## 365 ind_7168    M          F
## 367 ind_7212    M          F
## 368 ind_7235    M          F
## 369 ind_7250    M          F
## 370 ind_7295    M          F
## 372 ind_7301    M          F
## 373 ind_7303    M          F
## 375 ind_7351    M          F
## 376 ind_7364    M          F
## 379 ind_7450    M          F
## 380  ind_746    M          F
## 381 ind_7466    M          F
## 383 ind_7470    M          F
## 384 ind_7486    M          F
## 386 ind_7493    M          F
## 387 ind_7528    M          F
## 388 ind_7555    M          F
## 389 ind_7556    M          F
## 390 ind_7568    M          F
## 391 ind_7606    M          F
## 392 ind_7609    M          F
## 393 ind_7634    M          F
## 394 ind_7640    M          F
## 395 ind_7645    M          F
## 396  ind_765    M          F
## 397 ind_7679    M          F
## 398 ind_7749    M          F
## 399 ind_7754    M          F
## 400 ind_7836    M          F
## 401 ind_7889    M          F
## 402 ind_7900    M          F
## 403 ind_7910    M          F
## 404 ind_7934    M          F
## 406 ind_7989    M          F
## 408 ind_8029    M          F
## 409 ind_8037    M          F
## 410 ind_8059    M          F
## 411 ind_8061    M          F
## 414 ind_8117    M          F
## 415 ind_8126    M          F
## 416 ind_8129    M          F
## 418 ind_8170    M          F
## 419 ind_8242    M          F
## 420 ind_8247    M          F
## 421 ind_8249    M          F
## 422 ind_8276    M          F
## 424 ind_8345    M          F
## 425 ind_8360    M          F
## 426  ind_838    M          F
## 429   ind_84    M          F
## 431 ind_8407    M          F
## 433 ind_8523    M          F
## 434 ind_8541    M          F
## 436 ind_8555    M          F
## 437 ind_8649    M          F
## 438 ind_8666    M          F
## 439 ind_8667    M          F
## 440 ind_8668    M          F
## 442 ind_8689    M          F
## 443 ind_8691    M          F
## 444 ind_8695    M          F
## 445 ind_8698    M          F
## 446  ind_871    M          F
## 447 ind_8758    M          F
## 448  ind_879    M          F
## 453 ind_8859    M          F
## 454 ind_8880    M          F
## 455 ind_8894    M          F
## 456 ind_8914    M          F
## 458 ind_8917    M          F
## 459 ind_8919    M          F
## 463 ind_8996    M          F
## 464 ind_9010    M          F
## 467 ind_9122    M          F
## 469 ind_9163    M          F
## 471  ind_930    M          F
## 472  ind_931    M          F
## 473  ind_947    M          F

# Calculer les statistiques des individus avec des prédictions incorrectes
statistiques_incorrect <- data.frame(
  sexe = c("F", "M"),
  nb_incorrect = c(nrow(prediction_incorrect_f), nrow(prediction_incorrect_m)),
  pourcentage_incorrect = c(nrow(prediction_incorrect_f) / nrow(data_16[data_16$sexe == "F",]) * 100,
                            nrow(prediction_incorrect_m) / nrow(data_16[data_16$sexe == "M",]) * 100)
)

# Graphique
stats_incorrect <- ggplot(statistiques_incorrect, aes(x = sexe, y = pourcentage_incorrect, fill = sexe)) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_text(aes(label = paste(round(pourcentage_incorrect, 2), "%")), 
            vjust = -0.5, 
            position = position_dodge(0.9)) +
  labs(title = "Pourcentage d'individus avec des prédictions incorrect pour chaque sexe",
       x = "Sexe",
       y = "Pourcentage d'individus avec des prédictions incorrect") +
  scale_fill_manual(values = c("F" = "#FF9999", "M" = "#99CCFF")) + 
  theme_minimal() + 
  theme(text = element_text(size = 12), 
        plot.title = element_text(hjust = 0.5), 
        panel.grid.major = element_line(color = "grey80"),  
        panel.grid.minor = element_blank())  

# Enregistrer le graphique
ggsave("statistique_incorrect.png", plot = stats_incorrect, width = 8, height = 6, units = "in", dpi = 300)

# Afficher le graphique
stats_incorrect

# Faire une bagging pour chaque méthode
bagging_glm_16 <- bootstrap(data_16, data_16, colonnes_X = c("taille", "poids"), colonne_z = "sexe", methode = "glm", b_rep = 100)
bagging_lda_16 <- bootstrap(data_16, data_16, colonnes_X = c("taille", "poids"), colonne_z = "sexe", methode = "lda", b_rep = 100)
bagging_rpart_16 <- bootstrap(data_16, data_16, colonnes_X = c("taille", "poids"), colonne_z = "sexe", methode = "rpart", b_rep = 100)
bagging_knn_16 <- bootstrap(data_16, data_16, colonnes_X = c("taille", "poids"), colonne_z = "sexe", methode = "knn", b_rep = 100)

bagging_glm_17 <- bootstrap(data_17, data_17, colonnes_X = c("taille", "poids"), colonne_z = "sexe", methode = "glm", b_rep = 100)
bagging_lda_17 <- bootstrap(data_17, data_17, colonnes_X = c("taille", "poids"), colonne_z = "sexe", methode = "lda", b_rep = 100)
bagging_rpart_17 <- bootstrap(data_17, data_17, colonnes_X = c("taille", "poids"), colonne_z = "sexe", methode = "rpart", b_rep = 100)
bagging_knn_17 <- bootstrap(data_17, data_17, colonnes_X = c("taille", "poids"), colonne_z = "sexe", methode = "knn", b_rep = 100)

bagging_glm_18 <- bootstrap(data_18, data_18, colonnes_X = c("taille", "poids"), colonne_z = "sexe", methode = "glm", b_rep = 100)
bagging_lda_18 <- bootstrap(data_18, data_18, colonnes_X = c("taille", "poids"), colonne_z = "sexe", methode = "lda", b_rep = 100)
bagging_rpart_18 <- bootstrap(data_18, data_18, colonnes_X = c("taille", "poids"), colonne_z = "sexe", methode = "rpart", b_rep = 100)
bagging_knn_18 <- bootstrap(data_18, data_18, colonnes_X = c("taille", "poids"), colonne_z = "sexe", methode = "knn", b_rep = 100)

Code SAE 5-03

AUFRERE Thibault, CHALLET Marie, CNOCKAERT Jules

Packages utilisés

Importation et nettoyage des données

Préparation des données avec interpolation et prédiction

Analyse des données

Classification

Préparation des données pour la classification

Fonctions pour la classification

Application de la classification