Происходящее кажется мне настолько нелогичным, что я убеждён: я упускаю что-то очевидное, но никак не могу понять, что именно. У меня есть tibble, первые 200 строк которого приведены в конце этого вопроса.

Код, который я пробовал, был такой:

# Attempted summary: group by samples, rep and bests, then divide each group's
# row count n() by `samples` and round to two decimals.
record %>%
  group_by(samples, rep, bests) %>%
  summarise(prop = round(n()/samples, 2))

Однако это не дало ожидаемого результата. Вот что я получил:

> record %>%
+   group_by(samples, rep, bests) %>%
+   summarise(prop = round(n()/samples, 2))# %>%
`summarise()` regrouping output by 'samples', 'rep', 'bests' (override with `.groups` argument)
# A tibble: 200 x 4
# Groups:   samples, rep, bests [41]
   samples   rep bests   prop
     <dbl> <dbl> <chr>  <dbl>
 1      10     1 Change   0.3
 2      10     1 Change   0.3
 3      10     1 Change   0.3
 4      10     1 Stay     0.6
 5      10     1 Stay     0.6
 6      10     1 Stay     0.6
 7      10     1 Stay     0.6
 8      10     1 Stay     0.6
 9      10     1 Stay     0.6
10      10     2 Change   0.5
# … with 190 more rows

А вот что я ожидал получить:

> record %>%
+   group_by(samples, rep, bests) %>%
+   summarise(prop = round(n()/samples, 2))# %>%
`summarise()` regrouping output by 'samples', 'rep', 'bests' (override with `.groups` argument)
# A tibble: 4 x 4
# Groups:   samples, rep, bests [41]
   samples   rep bests   prop
     <dbl> <dbl> <chr>  <dbl>
 1      10     1 Change   0.3
 2      10     1 Stay     0.6
 3      10     2 Change   0.5
 4      10     2 Stay     0.5

Что я сделал не так? Разве summarise() не должна подводить итоги по каждой группе?

Мои данные:

# Reproducible sample data (dput output): a 200-row tibble with columns
# `samples` (10 in every row shown), `rep` (1 through 21) and
# `bests` ("Change" or "Stay").
record <- structure(list(samples = c(10, 10, 10, 10, 10, 10, 10, 10, 10, 
10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 
10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 
10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 
10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 
10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 
10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 
10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 
10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 
10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 
10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 
10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 
10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10), 
    rep = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 
    2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 
    4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 
    6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 
    8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 
    10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 
    11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 
    13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 
    14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 
    16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17, 
    17, 17, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 
    19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 
    20, 20, 21), bests = c("Change", "Stay", "Stay", "Stay", 
    "Change", "Stay", "Change", "Stay", "Stay", "Change", "Change", 
    "Stay", "Stay", "Change", "Change", "Stay", "Stay", "Stay", 
    "Change", "Change", "Stay", "Stay", "Change", "Stay", "Change", 
    "Change", "Change", "Change", "Change", "Change", "Change", 
    "Change", "Change", "Stay", "Change", "Change", "Change", 
    "Change", "Change", "Stay", "Stay", "Change", "Stay", "Stay", 
    "Change", "Change", "Change", "Change", "Change", "Stay", 
    "Change", "Stay", "Change", "Change", "Change", "Change", 
    "Stay", "Change", "Stay", "Stay", "Change", "Change", "Stay", 
    "Change", "Stay", "Change", "Stay", "Change", "Change", "Stay", 
    "Stay", "Change", "Change", "Stay", "Change", "Change", "Stay", 
    "Change", "Change", "Stay", "Change", "Change", "Stay", "Change", 
    "Change", "Change", "Change", "Change", "Change", "Change", 
    "Change", "Change", "Change", "Change", "Change", "Change", 
    "Change", "Stay", "Stay", "Change", "Stay", "Change", "Change", 
    "Change", "Stay", "Stay", "Change", "Stay", "Change", "Change", 
    "Change", "Change", "Change", "Change", "Stay", "Change", 
    "Stay", "Change", "Change", "Stay", "Change", "Change", "Change", 
    "Change", "Change", "Change", "Stay", "Change", "Change", 
    "Stay", "Change", "Stay", "Stay", "Change", "Stay", "Stay", 
    "Stay", "Change", "Change", "Stay", "Change", "Stay", "Stay", 
    "Stay", "Change", "Change", "Change", "Change", "Change", 
    "Stay", "Change", "Change", "Change", "Stay", "Change", "Change", 
    "Stay", "Change", "Stay", "Change", "Stay", "Change", "Stay", 
    "Change", "Change", "Change", "Change", "Change", "Change", 
    "Stay", "Stay", "Change", "Change", "Stay", "Stay", "Change", 
    "Change", "Stay", "Stay", "Change", "Change", "Stay", "Change", 
    "Stay", "Change", "Stay", "Stay", "Change", "Change", "Change", 
    "Change", "Change", "Stay", "Stay", "Change", "Stay", "Change", 
    "Stay", "Stay", "Change")), row.names = c(NA, -200L), class = c("tbl_df", 
"tbl", "data.frame"))
1
Érico Patto 23 Ноя 2020 в 01:30

1 ответ

Лучший ответ

Начиная с dplyr версии 1.0, summarise больше не обязана возвращать ровно одну строку на группу: если выражение даёт вектор длиной больше единицы, в результате окажется несколько строк на группу. В коде автора вопроса n() делится на 'samples', а это целый столбец (внутри группы — вектор, а не одно число), и в этом проблема. Можно свести 'samples' к первому элементу с помощью first (не используя 'samples' в качестве группирующей переменной):

library(dplyr)

# Collapse every (rep, bests) group to one row. `samples` is reduced to its
# first value in the group, so n() / samples is a single number per group.
# NOTE(review): assumes `samples` is constant within each (rep, bests) group.
record %>%
  group_by(rep, bests) %>%
  summarise(
    samples = first(samples),
    prop = round(n() / samples, 2),
    .groups = "drop"
  )

Вывод:

# A tibble: 41 x 4
#     rep bests  samples  prop
#   <dbl> <chr>    <dbl> <dbl>
# 1     1 Change      10   0.3
# 2     1 Stay        10   0.6
# 3     2 Change      10   0.5
# 4     2 Stay        10   0.5
# 5     3 Change      10   0.7
# 6     3 Stay        10   0.3
# 7     4 Change      10   0.9
# 8     4 Stay        10   0.1
# 9     5 Change      10   0.6
#10     5 Stay        10   0.4
# … with 31 more rows

Другой вариант — сначала вызвать count, который подсчитывает число строк для каждой уникальной комбинации значений, а затем вычислить долю с помощью mutate:

# Tally the rows for each (samples, rep, bests) combination into column `n`,
# then turn that count into a proportion of `samples`.
record %>%
  count(samples, rep, bests) %>%
  mutate(prop = round(n / samples, 2))
2
akrun 22 Ноя 2020 в 22:45