library(tidyverse)
library(magrittr)
library(glue)
library(stats)
library(tictoc)
library(gridExtra)
library(latex2exp)
library(tictoc)
library(gt)
library(ggforce)

8

Tenemos una muestra de t multivariadas:

\[ \mathbf{x}_i \overset{\text{iid}} \sim \mathcal{T}_{p,k} \\ \mathbf{x}_i = v^{1/2} \mathbf{z}_i \quad \mathbf{z}_i \overset{\text{iid}} \sim \mathcal{N}_p(\mathbf{0}, \mathbf{I}_p) \quad \frac{k}{v} \sim \chi^2_k \] \[ \mathbb{E}[\overline{\mathbf{x}}] = \mu \\ \mathbb{E}[\mathbf{S}] = \mathrm{Var}[\mathbf{x}] = \mathbb{E}v \cdot \Sigma = \frac{k}{k - 2} \cdot \mathbf{I}_p \quad k > 2 \]

Nos piden calcular, para cada muestra:

\[\begin{align*} & D = || \overline{\mathbf{x}} − μ||^2 \overset{\mathbb{E}} \longrightarrow \mathbf{0} & k > 1 \\ & D_1 = \mathrm{Tr} ( \mathbf{S} \Sigma^{−1} ) \overset{\mathbb{E}} \longrightarrow \frac{k}{k-2} \cdot p & k > 2 \\ & D_2 = \frac{ \mathrm{Tr} ( \mathbf{S} \Sigma^{−1} ) }{ \mathbb{E}[v] } \overset{\mathbb{E}} \longrightarrow p & k > 2 \end{align*}\]

Mini Ejemplo

Para cada escenario \((k, n)\), tendremos una matriz \(\mathbf{X} \in \mathbb{R}^{n \mathrm{x} 3}\), con los vectores \(\mathbf{x}_i\) como filas.

Acá, ejemplo con \(n = 10\) y \(k = 2\):

set.seed(12)

X <- mvtnorm::rmvt(10, df = 2, sigma = diag(3))
X
##             [,1]        [,2]       [,3]
##  [1,] -1.4180359  1.51055785 -0.9163365
##  [2,] -0.5277908 -1.14601218 -0.1562115
##  [3,] -0.7222954 -1.43899700 -0.2438519
##  [4,]  0.4085875 -0.74241942 -1.2351539
##  [5,] -0.7308800  0.01120533 -0.1428973
##  [6,] -0.7099990  1.19992315  0.3436754
##  [7,]  0.9065953 -0.52450843  0.3999310
##  [8,]  1.8145679  0.91485827 -0.2734319
##  [9,] -5.0040948 -1.30507268 -0.9718104
## [10,]  0.1017141  0.11309954  0.2808600

Calculamos el promedio de las \(\mathbf{x}_i\) con colMeans:

colMeans(X)
## [1] -0.5881631 -0.1407366 -0.2915227

Calculamos \(\mathbf{S}\) con cov:

cov(X)
##           [,1]      [,2]      [,3]
## [1,] 3.3025390 0.5819710 0.4396807
## [2,] 0.5819710 1.1429980 0.1003754
## [3,] 0.4396807 0.1003754 0.3328698

Una muestra \(\mathbf{x}_i \overset{\text{iid}} \sim \mathcal{T}_{3,k}\) (\(i = 1, \dots, n\)) para cada escenario \((n, k)\)

ESCENARIOS <-
  cross_df(list(
    n = c(10, 20, 50, seq(100, 1000, by = 50)),
    k = c(1, 2, 4, 10)
  ))

ESCENARIOS
SIGMA <- diag(3)
MU <- rep(0, 3)

traza <- function(A) { sum(diag(A)) }

simular <- function() {
  ESCENARIOS %>%
    mutate(
      # X = map2(n, k, ~ mvtnorm::rmvt(n = ..1, df = ..2, sigma = SIGMA)),
      X = map2(n, k, mvtnorm::rmvt, sigma = SIGMA),
      x_barra = map(X, colMeans),
      S = map(X, cov),
      D = map_dbl(x_barra, ~ sum((. - MU)^2)),
      D_1 = map_dbl(S, ~ traza(. %*% solve(SIGMA))),
      # Esperanza_v = map_dbl(k, ~ ifelse(. > 2, ./(. - 2), NA)),
      Esperanza_v = ifelse(k > 2, k/(k - 2), NA),
      D_2 = D_1/Esperanza_v
    ) %>%
    select(-X, -x_barra, -S)
}

tic()
escenarios <- simular()
toc()
## 0.141 sec elapsed
escenarios
escenarios %<>% mutate_if(is.double, round, 3)

hacer_tabla_D <- function(.escenarios) {
  .escenarios %>%
    select(n, k, D) %>%
    spread(n, D) %>%
    gt() %>%
    tab_header(
      title = "D → 0; si k > 1"
    ) %>%
    tab_spanner(
      columns = matches("[0-9]"),
      label = "n"
    ) %>%
    fmt_number(
      columns = matches("[0-9]"),
      decimals = 3
    )
}

hacer_tabla_D1 <- function(.escenarios) {
  .escenarios %>%
    select(n, k, D_1) %>%
    spread(n, D_1) %>%
    gt() %>%
    tab_header(
      title = "D1 → (k / (k - 2)) p; si k > 2"
    ) %>%
    tab_spanner(
      columns = matches("[0-9]"),
      label = "n"
    ) %>%
    fmt_number(
      columns = matches("[0-9]"),
      decimals = 3
    )
}

hacer_tabla_D2 <- function(.escenarios) {
  .escenarios %>%
    select(n, k, D_2) %>%
    spread(n, D_2) %>%
    gt() %>%
    tab_header(
      title = "D2 → p; si k > 2"
    ) %>%
    tab_spanner(
      columns = matches("[0-9]"),
      label = "n"
    ) %>%
    fmt_number(
      columns = matches("[0-9]"),
      decimals = 3
    ) %>%
    fmt_missing(
      columns = everything(),
      missing_text = "-"
    )
}
plot_facetado_k_n <- function(.escenarios) {
  .escenarios %>%
    mutate(k_str = fct_reorder(glue("k = {k}"), k)) %>%
    ggplot(aes(x = n)) +
    facet_grid(k_str ~ ., scales = "free_y")
}
graficar_D <- function(.escenarios) {
  .escenarios %>%
    plot_facetado_k_n() +
    aes(y = D) +
    geom_hline(yintercept = 0, color = "Grey40", linetype = "dotted") +
    geom_line(color = "Grey20") +
    geom_point() +
    labs(
      title = TeX("Evolución de $D = || \\bar{x} - \\mu ||^2 \\rightarrow 0$"),
      subtitle = "Converge para k > 1"
    )
}
graficar_D1 <- function(.escenarios) {
  plot_facetado_k_n(.escenarios) +
    aes(y = D_1) +
    geom_hline(
      mapping = aes(yintercept = Esperanza_v * 3),
      color = "ForestGreen", linetype = "dashed"
    ) +
    geom_point() +
    geom_line(color = "Grey50") +
    labs(
      title = TeX("Evolución de $D_1 = Tr ( S \\Sigma^{-1} ) \\rightarrow \\left( \\frac{k}{k - 2} \\right) \\cdot p$"),
      subtitle = "Converge si k > 2",
      y = TeX("$D_1$")
    )
}
graficar_D2 <- function(.escenarios) {
  .escenarios %>%
    filter(!is.na(D_2)) %>%
    plot_facetado_k_n() +
    aes(y = D_2) +
    geom_hline(yintercept = 3, color = "ForestGreen", linetype = "dashed") +
    geom_point() +
    geom_line(color = "Grey50") +
    # geom_segment(aes(xend = n, yend = 3), color = "Red4") +
    labs(
      title = TeX("Evolución de $D_2 = Tr ( S \\Sigma^{-1} ) / E(v) \\rightarrow  p$"),
      subtitle = TeX("Sólo definido si k > 2"),
      y = TeX("$D_2$")
    )
}
hacer_tabla_D(escenarios)
D → 0; si k > 1
k n
10 20 50 100 150 200 250 300 350 400 450 500 550 600 650 700 750 800 850 900 950 1000
1 3.366 7.452 1,305.005 7.187 19.073 2.594 0.361 68.846 1.097 0.804 3.298 0.696 138,001.372 0.911 2.692 0.653 2.109 54.338 8.149 1.514 1.270 3.616
2 0.843 0.402 0.221 0.590 0.046 0.106 0.022 0.026 0.035 0.050 0.038 0.015 0.010 0.056 0.005 0.223 0.036 0.024 0.002 0.012 0.078 0.007
4 0.257 0.219 0.037 0.000 0.056 0.032 0.021 0.029 0.019 0.009 0.022 0.057 0.003 0.009 0.010 0.003 0.004 0.010 0.003 0.003 0.010 0.001
10 0.939 0.155 0.030 0.016 0.014 0.006 0.005 0.012 0.005 0.001 0.000 0.002 0.006 0.004 0.017 0.011 0.011 0.005 0.006 0.014 0.002 0.000
graficar_D(escenarios)

\[ \frac{k}{k - 2} \cdot p = \frac{4}{4 - 2} \cdot 3 = 6 \\ \frac{k}{k - 2} \cdot p = \frac{10}{10 - 2} \cdot 3 = 3.75 \]

hacer_tabla_D1(escenarios)
D1 → (k / (k - 2)) p; si k > 2
k n
10 20 50 100 150 200 250 300 350 400 450 500 550 600 650 700 750 800 850 900 950 1000
1 23.753 95.266 55,857.677 296.382 3,346.131 652.498 1,017.422 14,150.050 724.330 138.844 1,017.177 1,033.436 75,949,605.605 1,251.231 2,787.765 634.055 1,126.324 30,951.765 7,469.481 1,777.867 3,210.761 2,689.247
2 10.332 6.325 13.785 130.084 12.875 17.515 14.361 20.054 26.693 43.589 18.840 20.651 18.073 24.850 25.980 222.668 20.551 17.277 22.421 21.234 50.643 32.950
4 8.011 5.763 4.437 7.039 5.460 6.799 5.262 4.811 5.718 5.434 5.516 5.600 5.951 6.213 5.178 5.856 5.483 6.311 5.546 6.597 6.229 5.774
10 2.649 3.710 2.737 3.051 3.958 3.940 3.703 3.807 3.837 3.619 3.673 3.668 4.082 3.673 3.696 3.765 3.856 3.644 3.729 3.794 3.617 3.622
graficar_D1(escenarios)
## Warning: Removed 44 rows containing missing values (geom_hline).

hacer_tabla_D2(escenarios)
D2 → p; si k > 2
k n
10 20 50 100 150 200 250 300 350 400 450 500 550 600 650 700 750 800 850 900 950 1000
1 - - - - - - - - - - - - - - - - - - - - - -
2 - - - - - - - - - - - - - - - - - - - - - -
4 4.006 2.881 2.219 3.520 2.730 3.400 2.631 2.405 2.859 2.717 2.758 2.800 2.976 3.106 2.589 2.928 2.742 3.156 2.773 3.298 3.114 2.887
10 2.119 2.968 2.190 2.441 3.166 3.152 2.963 3.046 3.070 2.895 2.938 2.934 3.265 2.939 2.957 3.012 3.085 2.915 2.983 3.035 2.894 2.898
graficar_D2(escenarios)

Consigna (b): \(1.000\) iteraciones y promediar para cada \((n, k)\)

NITER <- 1000

tic()

simulaciones <-
  tibble(
    iter = 1:NITER,
    simulacion = map(iter, ~ simular())
  ) %>%
  unnest()

toc()
## 70.32 sec elapsed
promedios <-
  simulaciones %>%
  group_by(n, k, Esperanza_v) %>%
  summarise(
    count = n(),
    D = mean(D),
    D_1 = mean(D_1),
    D_2 = mean(D_2)
  ) %>%
  mutate_if(is.double, round, 3) %>%
  ungroup()

promedios %>% arrange(k, n)
mensajito <- glue("Promedio de {NITER} iteraciones")

nota_al_pie <- function(data) {
  tab_source_note(
    data = data,
    source = mensajito
  )
}

caption <- labs(caption = mensajito)
hacer_tabla_D(promedios) %>% nota_al_pie()
D → 0; si k > 1
k n
10 20 50 100 150 200 250 300 350 400 450 500 550 600 650 700 750 800 850 900 950 1000
1 491.023 994.800 6,274.841 19,786.703 7,487.349 1,860.939 165,910.073 3,033.474 1,024,892.514 676.114 4,168.928 4,952.103 1,259.951 74,005.663 3,049.628 74,281.261 64,542.197 119,486.916 4,169.782 5,324,136.106 5,715.676 2,254.718
2 4.949 1.425 0.778 0.373 0.227 0.170 0.165 0.139 0.102 0.087 0.087 0.092 0.080 0.244 0.064 0.067 0.052 0.050 0.051 0.038 0.516 0.051
4 0.565 0.298 0.120 0.061 0.039 0.032 0.024 0.019 0.017 0.016 0.013 0.012 0.011 0.010 0.009 0.009 0.008 0.007 0.007 0.007 0.006 0.006
10 0.364 0.188 0.078 0.036 0.025 0.018 0.015 0.013 0.011 0.010 0.008 0.008 0.007 0.006 0.006 0.006 0.005 0.005 0.005 0.004 0.004 0.004
Promedio de 1000 iteraciones
graficar_D(promedios) + caption

hacer_tabla_D1(promedios) %>% nota_al_pie()
D1 → (k / (k - 2)) p; si k > 2
k n
10 20 50 100 150 200 250 300 350 400 450 500 550 600 650 700 750 800 850 900 950 1000
1 4,960.308 19,808.115 312,083.129 1,980,472.995 1,125,722.898 367,397.192 41,554,256.610 907,404.915 358,668,330.937 265,723.745 1,873,563.732 2,482,409.616 687,073.406 44,447,938.034 1,992,532.453 52,001,253.436 48,430,252.109 95,599,590.979 3,548,918.883 4,791,753,639.840 5,392,643.344 2,257,874.037
2 47.353 29.015 38.891 37.828 34.737 34.304 42.112 41.605 35.762 34.870 37.750 46.764 43.456 148.520 40.429 47.596 40.057 39.399 43.741 35.488 487.165 50.829
4 5.948 6.012 6.052 6.019 6.025 6.044 6.009 5.925 6.009 6.006 6.005 5.979 6.014 5.981 5.974 5.989 5.999 6.006 6.002 6.009 5.984 5.980
10 3.717 3.740 3.733 3.758 3.762 3.735 3.741 3.760 3.749 3.746 3.749 3.750 3.751 3.755 3.755 3.749 3.751 3.750 3.753 3.743 3.747 3.750
Promedio de 1000 iteraciones
graficar_D1(promedios) + caption
## Warning: Removed 44 rows containing missing values (geom_hline).

hacer_tabla_D2(promedios) %>% nota_al_pie()
D2 → p; si k > 2
k n
10 20 50 100 150 200 250 300 350 400 450 500 550 600 650 700 750 800 850 900 950 1000
1 - - - - - - - - - - - - - - - - - - - - - -
2 - - - - - - - - - - - - - - - - - - - - - -
4 2.974 3.006 3.026 3.009 3.012 3.022 3.005 2.963 3.005 3.003 3.003 2.989 3.007 2.991 2.987 2.995 3.000 3.003 3.001 3.005 2.992 2.990
10 2.974 2.992 2.986 3.006 3.009 2.988 2.993 3.008 3.000 2.996 2.999 3.000 3.000 3.004 3.004 2.999 3.001 3.000 3.003 2.995 2.998 3.000
Promedio de 1000 iteraciones
graficar_D2(promedios) + caption