R:创建一个变量,表示自我(ego)结婚时仍在世的哥哥姐姐人数
r creating older siblings alive at marriage of the ego
这是我之前问题的改进版:我有一个数据集,包含母亲(`mid`)、孩子(`id`),以及这些孩子的出生(`born`)、死亡(`dead`)和结婚日期(`Marriage_Date`)。`ord` 给出出生顺序,`tstart` 和 `tstop` 是个体进入和退出观察的年份。`start3` 和 `stop3`(原文写作 tstart3 和 tstop3)表示每个观察区间内此人的年龄。观察时间采用生存分析格式,即按年切分为多个时段(cut in spells)。
对于每个孩子,我想创建一列,表示随时间变化的、在自我(ego,即 `id`)处于观察期内时仍然在世的哥哥姐姐人数,如果可能的话男女分开统计(结局是他或她结婚——这是我分析的事件)。
数据框本身不应更改,因为它已经是生存分析所需的格式。需要计算的是每个人在 `tstop` 时仍然在世的兄弟姐妹的数量。如果一个已存在的孩子又有一个兄弟姐妹出生,新列 `oldersiblings` 就加 1,下一个出生时再加 1,直到母亲的 id 发生变化为止。如果一个兄弟姐妹死亡,则减 1,直到下一个兄弟姐妹出生。
因此,在我看来,解决方案应该类似于按 `mid`(和 `id`?)分组的 for 循环。
这里是 dput 格式的数据提取:
structure(list(id = c("1799939", "1799939", "1799939", "1799939",
"1799939", "1799939", "1799939", "1132576", "1132576", "1132576",
"1132576", "1132576", "1132576", "1132576", "1132576", "1112778",
"1112778", "1112778", "1112778", "1112778", "1112778", "1112778",
"1112778", "1112778", "1667687", "1667687", "1667687", "1667687",
"1667687", "1667687", "1667687", "1667687", "1667687", "1667687",
"1667687", "1667687", "1667687", "1667687", "1667687", "1667687",
"1667687", "1667687", "1667687", "1667687", "1667687", "1667687",
"1667687", "1667687", "1667687", "1667687", "1667687", "1667687",
"1667687", "1667687", "1667687", "1667687", "1667687", "1933509",
"1933509", "1933509", "1933509", "1933509", "1933509", "1933509",
"1933509", "1933509", "1097672", "1097672", "1097672", "1097672",
"1097672", "1097672", "1097672", "1097672", "1097672", "1097672",
"1097672", "1097672", "1097672", "1097672", "1039958", "1039958",
"1039958", "1039958", "1039958", "1039958", "1039958"), mid = c("1004212",
"1004212", "1004212", "1004212", "1004212", "1004212", "1004212",
"1004212", "1004212", "1004212", "1004212", "1004212", "1004212",
"1004212", "1004212", "1004212", "1004212", "1004212", "1004212",
"1004212", "1004212", "1004212", "1004212", "1004212", "1004212",
"1004212", "1004212", "1004212", "1004212", "1004212", "1004212",
"1004212", "1004212", "1004212", "1004212", "1004212", "1004212",
"1004212", "1004212", "1004212", "1004212", "1004212", "1004212",
"1004212", "1004212", "1004212", "1004212", "1004212", "1004212",
"1004212", "1004212", "1004212", "1004212", "1004212", "1004212",
"1004212", "1004212", "1004212", "1004212", "1004212", "1004212",
"1004212", "1004212", "1004212", "1004212", "1004212", "1004212",
"1004212", "1004212", "1004212", "1004212", "1004212", "1004212",
"1004212", "1004212", "1004212", "1004212", "1004212", "1004212",
"1004212", "1004241", "1004241", "1004241", "1004241", "1004241",
"1004241", "1004241"), born = c(1813.5698630137, 1813.5698630137,
1813.5698630137, 1813.5698630137, 1813.5698630137, 1813.5698630137,
1813.5698630137, 1815.48767123288, 1815.48767123288, 1815.48767123288,
1815.48767123288, 1815.48767123288, 1815.48767123288, 1815.48767123288,
1815.48767123288, 1818.01095890411, 1818.01095890411, 1818.01095890411,
1818.01095890411, 1818.01095890411, 1818.01095890411, 1818.01095890411,
1818.01095890411, 1818.01095890411, 1820.2131147541, 1820.2131147541,
1820.2131147541, 1820.2131147541, 1820.2131147541, 1820.2131147541,
1820.2131147541, 1820.2131147541, 1820.2131147541, 1820.2131147541,
1820.2131147541, 1820.2131147541, 1820.2131147541, 1820.2131147541,
1820.2131147541, 1820.2131147541, 1820.2131147541, 1820.2131147541,
1820.2131147541, 1820.2131147541, 1820.2131147541, 1820.2131147541,
1820.2131147541, 1820.2131147541, 1820.2131147541, 1820.2131147541,
1820.2131147541, 1820.2131147541, 1820.2131147541, 1820.2131147541,
1820.2131147541, 1820.2131147541, 1820.2131147541, 1824.33333333333,
1824.33333333333, 1824.33333333333, 1824.33333333333, 1824.33333333333,
1824.33333333333, 1824.33333333333, 1824.33333333333, 1824.33333333333,
1826.89589041096, 1826.89589041096, 1826.89589041096, 1826.89589041096,
1826.89589041096, 1826.89589041096, 1826.89589041096, 1826.89589041096,
1826.89589041096, 1826.89589041096, 1826.89589041096, 1826.89589041096,
1826.89589041096, 1826.89589041096, 1828.28142076503, 1828.28142076503,
1828.28142076503, 1828.28142076503, 1828.28142076503, 1828.28142076503,
1828.28142076503), dead = c(1863.97808219178, 1863.97808219178,
1863.97808219178, 1863.97808219178, 1863.97808219178, 1863.97808219178,
1863.97808219178, 1844.07103825137, 1844.07103825137, 1844.07103825137,
1844.07103825137, 1844.07103825137, 1844.07103825137, 1844.07103825137,
1844.07103825137, 1890.85205479452, 1890.85205479452, 1890.85205479452,
1890.85205479452, 1890.85205479452, 1890.85205479452, 1890.85205479452,
1890.85205479452, 1890.85205479452, 1871.81643835616, 1871.81643835616,
1871.81643835616, 1871.81643835616, 1871.81643835616, 1871.81643835616,
1871.81643835616, 1871.81643835616, 1871.81643835616, 1871.81643835616,
1871.81643835616, 1871.81643835616, 1871.81643835616, 1871.81643835616,
1871.81643835616, 1871.81643835616, 1871.81643835616, 1871.81643835616,
1871.81643835616, 1871.81643835616, 1871.81643835616, 1871.81643835616,
1871.81643835616, 1871.81643835616, 1871.81643835616, 1871.81643835616,
1871.81643835616, 1871.81643835616, 1871.81643835616, 1871.81643835616,
1871.81643835616, 1871.81643835616, 1871.81643835616, 1902.33150684932,
1902.33150684932, 1902.33150684932, 1902.33150684932, 1902.33150684932,
1902.33150684932, 1902.33150684932, 1902.33150684932, 1902.33150684932,
1862.01643835616, 1862.01643835616, 1862.01643835616, 1862.01643835616,
1862.01643835616, 1862.01643835616, 1862.01643835616, 1862.01643835616,
1862.01643835616, 1862.01643835616, 1862.01643835616, 1862.01643835616,
1862.01643835616, 1862.01643835616, 1852.22950819672, 1852.22950819672,
1852.22950819672, 1852.22950819672, 1852.22950819672, 1852.22950819672,
1852.22950819672), Marriage_Date = c(1837.8602739726, 1837.8602739726,
1837.8602739726, 1837.8602739726, 1837.8602739726, 1837.8602739726,
1837.8602739726, 1838.3397260274, 1838.3397260274, 1838.3397260274,
1838.3397260274, 1838.3397260274, 1838.3397260274, 1838.3397260274,
1838.3397260274, 1844.91256830601, 1844.91256830601, 1844.91256830601,
1844.91256830601, 1844.91256830601, 1844.91256830601, 1844.91256830601,
1844.91256830601, 1844.91256830601, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1850.10684931507, 1850.10684931507,
1850.10684931507, 1850.10684931507, 1850.10684931507, 1850.10684931507,
1850.10684931507, 1850.10684931507, 1850.10684931507, 1857.06301369863,
1857.06301369863, 1857.06301369863, 1857.06301369863, 1857.06301369863,
1857.06301369863, 1857.06301369863, 1857.06301369863, 1857.06301369863,
1857.06301369863, 1857.06301369863, 1857.06301369863, 1857.06301369863,
1857.06301369863, NA, NA, NA, NA, NA, NA, NA), start3 = c(0,
0.430136986301477, 1.43013698630148, 2.43013698630148, 3.43013698630148,
4.43013698630148, 5.43013698630148, 0, 0.512328767123336, 1.51232876712334,
2.51232876712334, 3.51232876712334, 4.51232876712334, 6.51232876712334,
5.51232876712334, 0, 0.989041095890343, 1.98904109589034, 2.98904109589034,
3.98904109589034, 4.98904109589034, 5.98904109589034, 6.98904109589034,
7.98904109589034, 0, 0.786885245901658, 1.78688524590166, 2.78688524590166,
3.78688524590166, 4.78688524590166, 5.78688524590166, 6.78688524590166,
7.78688524590166, 8.78688524590166, 9.78688524590166, 10.7868852459017,
11.7868852459017, 12.7868852459017, 13.7868852459017, 14.7868852459017,
15.7868852459017, 16.7868852459017, 17.7868852459017, 18.7868852459017,
19.7868852459017, 20.7868852459017, 21.7868852459017, 22.7868852459017,
23.7868852459017, 24.7868852459017, 25.7868852459017, 26.7868852459017,
27.7868852459017, 28.7868852459017, 29.7868852459017, 30.7868852459017,
31.7868852459017, 0, 0.666666666666742, 1.66666666666674, 2.66666666666674,
3.66666666666674, 4.66666666666674, 5.66666666666674, 6.66666666666674,
7.66666666666674, 0, 0.104109589041173, 1.10410958904117, 2.10410958904117,
3.10410958904117, 4.10410958904117, 5.10410958904117, 6.10410958904117,
7.10410958904117, 8.10410958904117, 9.10410958904117, 10.1041095890412,
12.1041095890412, 11.1041095890412, 0, 0.718579234972594, 1.71857923497259,
2.71857923497259, 3.71857923497259, 5.71857923497259, 4.71857923497259
), stop3 = c(0.430136986301477, 1.43013698630148, 2.43013698630148,
3.43013698630148, 4.43013698630148, 5.43013698630148, 6.2904109589042,
0.512328767123336, 1.51232876712334, 2.51232876712334, 3.51232876712334,
4.51232876712334, 5.51232876712334, 6.85205479452065, 6.51232876712334,
0.989041095890343, 1.98904109589034, 2.98904109589034, 3.98904109589034,
4.98904109589034, 5.98904109589034, 6.98904109589034, 7.98904109589034,
8.90160940190117, 0.786885245901658, 1.78688524590166, 2.78688524590166,
3.78688524590166, 4.78688524590166, 5.78688524590166, 6.78688524590166,
7.78688524590166, 8.78688524590166, 9.78688524590166, 10.7868852459017,
11.7868852459017, 12.7868852459017, 13.7868852459017, 14.7868852459017,
15.7868852459017, 16.7868852459017, 17.7868852459017, 18.7868852459017,
19.7868852459017, 20.7868852459017, 21.7868852459017, 22.7868852459017,
23.7868852459017, 24.7868852459017, 25.7868852459017, 26.7868852459017,
27.7868852459017, 28.7868852459017, 29.7868852459017, 30.7868852459017,
31.7868852459017, 32, 0.666666666666742, 1.66666666666674, 2.66666666666674,
3.66666666666674, 4.66666666666674, 5.66666666666674, 6.66666666666674,
7.66666666666674, 7.77351598173527, 0.104109589041173, 1.10410958904117,
2.10410958904117, 3.10410958904117, 4.10410958904117, 5.10410958904117,
6.10410958904117, 7.10410958904117, 8.10410958904117, 9.10410958904117,
10.1041095890412, 11.1041095890412, 12.1671232876713, 12.1041095890412,
0.718579234972594, 1.71857923497259, 2.71857923497259, 3.71857923497259,
4.71857923497259, 5.94808743169392, 5.71857923497259), ord = c("1",
"1", "1", "1", "1", "1", "1", "2", "2", "2", "2", "2", "2", "2",
"2", "3", "3", "3", "3", "3", "3", "3", "3", "3", "4", "4", "4",
"4", "4", "4", "4", "4", "4", "4", "4", "4", "4", "4", "4", "4",
"4", "4", "4", "4", "4", "4", "4", "4", "4", "4", "4", "4", "4",
"4", "4", "4", "4", "6", "6", "6", "6", "6", "6", "6", "6", "6",
"7", "7", "7", "7", "7", "7", "7", "7", "7", "7", "7", "7", "7",
"7", "1", "1", "1", "1", "1", "1", "1"), tstart = c(1831.5698630137,
1832, 1833, 1834, 1835, 1836, 1837, 1831.48767123288, 1832, 1833,
1834, 1835, 1836, 1838, 1837, 1836.01095890411, 1837, 1838, 1839,
1840, 1841, 1842, 1843, 1844, 1838.2131147541, 1839, 1840, 1841,
1842, 1843, 1844, 1845, 1846, 1847, 1848, 1849, 1850, 1851, 1852,
1853, 1854, 1855, 1856, 1857, 1858, 1859, 1860, 1861, 1862, 1863,
1864, 1865, 1866, 1867, 1868, 1869, 1870, 1842.33333333333, 1843,
1844, 1845, 1846, 1847, 1848, 1849, 1850, 1844.89589041096, 1845,
1846, 1847, 1848, 1849, 1850, 1851, 1852, 1853, 1854, 1855, 1857,
1856, 1846.28142076503, 1847, 1848, 1849, 1850, 1852, 1851),
tstop = c(1832, 1833, 1834, 1835, 1836, 1837, 1837.8602739726,
1832, 1833, 1834, 1835, 1836, 1837, 1838.3397260274, 1838,
1837, 1838, 1839, 1840, 1841, 1842, 1843, 1844, 1844.91256830601,
1839, 1840, 1841, 1842, 1843, 1844, 1845, 1846, 1847, 1848,
1849, 1850, 1851, 1852, 1853, 1854, 1855, 1856, 1857, 1858,
1859, 1860, 1861, 1862, 1863, 1864, 1865, 1866, 1867, 1868,
1869, 1870, 1870.2131147541, 1843, 1844, 1845, 1846, 1847,
1848, 1849, 1850, 1850.10684931507, 1845, 1846, 1847, 1848,
1849, 1850, 1851, 1852, 1853, 1854, 1855, 1856, 1857.06301369863,
1857, 1847, 1848, 1849, 1850, 1851, 1852.22950819672, 1852
)), .Names = c("id", "mid", "born", "dead", "Marriage_Date",
"start3", "stop3", "ord", "tstart", "tstop"), row.names = c(NA,
-87L), class = c("grouped_df", "tbl_df", "tbl", "data.frame"), vars = c("mid",
"id"), drop = TRUE, indices = list(66:79, 15:23, 7:14, 24:56,
0:6, 57:65, 80:86), group_sizes = c(14L, 9L, 8L, 33L, 7L,
9L, 7L), biggest_group_size = 33L, labels = structure(list(mid = c("1004212",
"1004212", "1004212", "1004212", "1004212", "1004212", "1004241"
), id = c("1097672", "1112778", "1132576", "1667687", "1799939",
"1933509", "1039958")), row.names = c(NA, -7L), class = "data.frame", vars = c("mid",
"id"), drop = TRUE, .Names = c("mid", "id")))
我的许多试验之一是:
# Asker's attempt, kept exactly as posted (it does not give the wanted count).
# Sort the rows by mother id, then by birth order.
fam <- fam[
with(fam, order(mid, ord)),
]
#fam <- fam[ which(fam$mid > 1004211 & fam$mid < 1004246 ) ,]
# Group by mother and child.
fam <- group_by(fam, mid, id)
# Fill older_sibs_alive one row at a time.
# NOTE(review): which() is closed right after the `mid` comparison, so
# length() measures the vector produced by the `&` chain instead of counting
# matching rows; the `id == id[i]` term also selects the ego's own rows
# rather than the siblings'.
for(i in 1:nrow(fam)){
fam$older_sibs_alive[i] <- length(which(fam$mid==fam$mid[i]) & fam$id==fam$id[i] & lag(fam$dead) < fam$Marriage_Date[i])
}
当然不是我想要的
有谁有好的解决方案吗?非常感谢。
以下解决方案可以满足您的要求:
代码
library(dplyr)

# Add `n_older_siblings` to every person-year row of `fam`: the number of
# children of the same mother with a lower birth order who are alive at the
# row's `tstop`.
#
# NOTE(review): a missing `dead` date makes the comparison NA and so the whole
# count NA for that row; decide how such siblings should count before adding
# `na.rm = TRUE`.
fam <- fam %>%
  # The data was provided as a grouped_df; ungroup for a clean slate.
  ungroup() %>%
  # Keep only the fields needed to describe a sibling.
  select(
    sibling.id = id,
    mid,
    sibling.born = born,
    sibling.dead = dead,
    ord
  ) %>%
  # One row per child, with the birth order as a number.
  distinct(
    sibling.id,
    mid,
    sibling.born,
    sibling.dead,
    sibling.ord = as.numeric(ord)
  ) %>%
  # Pair each row of the original fam with every child of the same mother
  # (the ego itself included; it is excluded in the count below).
  inner_join(fam, by = c("mid" = "mid")) %>%
  # Group by all the columns of the original fam so that summarise() returns
  # one row per distinct original row.
  group_by_at(vars(one_of(colnames(fam)))) %>%
  summarise(
    n_older_siblings = sum(
      sibling.id != id & # the row is a sibling, not the ego
        sibling.born <= tstop & # the sibling has been born
        sibling.dead > tstop & # the sibling is not yet dead
        sibling.ord < as.numeric(ord) # the sibling was born first
    )
  ) %>%
  # Remove the sibling columns.
  select(-contains("sibling.")) %>%
  # summarise() drops only the last grouping level; clear the rest so later
  # verbs do not silently run once per leftover group.
  ungroup()
说明
对于 `data.frame` 中的每条记录,我们都需要知道此人的兄弟姐妹。因此,首先创建一个新的 `tibble`,其中每个人只占一行,仅包含其 id、mid、born、dead 和 ord。
# Build the sibling lookup: one row per child, columns prefixed "sibling.".
fam %>%
  # fam is grouped by mid and id; ungroup first so the grouping does not
  # interfere with select() and distinct().
  ungroup() %>%
  select(
    sibling.id = id,
    mid,
    sibling.born = born,
    sibling.dead = dead,
    ord
  ) %>%
  distinct(
    sibling.id,
    mid,
    sibling.born,
    sibling.dead,
    sibling.ord = as.numeric(ord)
  )
我们将这个兄弟姐妹 `tibble` 连接(join)到原始的 `fam` `data.frame` 上。结果是一个更大的 `tibble`,其中每一行都是“个人 × 观察时段 × 同母的某个孩子”的唯一组合。由于按 `mid` 连接也会匹配到本人,如果一个人有 5 个兄弟姐妹,那么 `fam` 中的每一行现在会变成 6 行。
# Pipeline step (its input is the sibling tibble): match on the mother id so
# that each row of fam is paired with every child of the same mother.
inner_join(fam, by = c("mid" = "mid"))
我们想把它折叠回原来的行数,方法是对数据进行分组并汇总:按原始 `fam` `data.frame` 中的所有列分组,然后对一组逻辑表达式的合取结果求和(即统计其为 TRUE 的行数),这些逻辑表达式编码了您要统计的条件。
# Pipeline step: regroup on every original fam column, then count per group
# the joined rows that satisfy all four conditions below.
group_by_at(vars(one_of(colnames(fam)))) %>%
summarise(n_older_siblings =
sum(sibling.id != id & # the joined row is another child, not the ego
sibling.born <= tstop & # born on or before the end of the interval
sibling.dead > tstop & # dies after the end of the interval
sibling.ord < as.numeric(ord))) %>% # lower birth order than the ego
`group_by_at` 允许我们用代码指定要分组的列;这里的 `vars(one_of(colnames(fam)))` 表示按 `fam` `data.frame` 中的所有列进行分组。
最后,去掉名称中包含 "sibling." 的列,因为这些是我们一开始在兄弟姐妹 `tibble` 中创建的列。
这是我之前问题的改进版:我有一个数据集,包含母亲(`mid`)、孩子(`id`),以及这些孩子的出生(`born`)、死亡(`dead`)和结婚日期(`Marriage_Date`)。`ord` 给出出生顺序,`tstart` 和 `tstop` 是个体进入和退出观察的年份。`start3` 和 `stop3`(原文写作 tstart3 和 tstop3)表示每个观察区间内此人的年龄。观察时间采用生存分析格式,即按年切分为多个时段(cut in spells)。
对于每个孩子,我想创建一列,表示随时间变化的、在自我(ego,即 `id`)处于观察期内时仍然在世的哥哥姐姐人数,如果可能的话男女分开统计(结局是他或她结婚——这是我分析的事件)。
数据框本身不应更改,因为它已经是生存分析所需的格式。需要计算的是每个人在 `tstop` 时仍然在世的兄弟姐妹的数量。如果一个已存在的孩子又有一个兄弟姐妹出生,新列 `oldersiblings` 就加 1,下一个出生时再加 1,直到母亲的 id 发生变化为止。如果一个兄弟姐妹死亡,则减 1,直到下一个兄弟姐妹出生。
因此,在我看来,解决方案应该类似于按 `mid`(和 `id`?)分组的 for 循环。
这里是 dput 格式的数据提取:
structure(list(id = c("1799939", "1799939", "1799939", "1799939",
"1799939", "1799939", "1799939", "1132576", "1132576", "1132576",
"1132576", "1132576", "1132576", "1132576", "1132576", "1112778",
"1112778", "1112778", "1112778", "1112778", "1112778", "1112778",
"1112778", "1112778", "1667687", "1667687", "1667687", "1667687",
"1667687", "1667687", "1667687", "1667687", "1667687", "1667687",
"1667687", "1667687", "1667687", "1667687", "1667687", "1667687",
"1667687", "1667687", "1667687", "1667687", "1667687", "1667687",
"1667687", "1667687", "1667687", "1667687", "1667687", "1667687",
"1667687", "1667687", "1667687", "1667687", "1667687", "1933509",
"1933509", "1933509", "1933509", "1933509", "1933509", "1933509",
"1933509", "1933509", "1097672", "1097672", "1097672", "1097672",
"1097672", "1097672", "1097672", "1097672", "1097672", "1097672",
"1097672", "1097672", "1097672", "1097672", "1039958", "1039958",
"1039958", "1039958", "1039958", "1039958", "1039958"), mid = c("1004212",
"1004212", "1004212", "1004212", "1004212", "1004212", "1004212",
"1004212", "1004212", "1004212", "1004212", "1004212", "1004212",
"1004212", "1004212", "1004212", "1004212", "1004212", "1004212",
"1004212", "1004212", "1004212", "1004212", "1004212", "1004212",
"1004212", "1004212", "1004212", "1004212", "1004212", "1004212",
"1004212", "1004212", "1004212", "1004212", "1004212", "1004212",
"1004212", "1004212", "1004212", "1004212", "1004212", "1004212",
"1004212", "1004212", "1004212", "1004212", "1004212", "1004212",
"1004212", "1004212", "1004212", "1004212", "1004212", "1004212",
"1004212", "1004212", "1004212", "1004212", "1004212", "1004212",
"1004212", "1004212", "1004212", "1004212", "1004212", "1004212",
"1004212", "1004212", "1004212", "1004212", "1004212", "1004212",
"1004212", "1004212", "1004212", "1004212", "1004212", "1004212",
"1004212", "1004241", "1004241", "1004241", "1004241", "1004241",
"1004241", "1004241"), born = c(1813.5698630137, 1813.5698630137,
1813.5698630137, 1813.5698630137, 1813.5698630137, 1813.5698630137,
1813.5698630137, 1815.48767123288, 1815.48767123288, 1815.48767123288,
1815.48767123288, 1815.48767123288, 1815.48767123288, 1815.48767123288,
1815.48767123288, 1818.01095890411, 1818.01095890411, 1818.01095890411,
1818.01095890411, 1818.01095890411, 1818.01095890411, 1818.01095890411,
1818.01095890411, 1818.01095890411, 1820.2131147541, 1820.2131147541,
1820.2131147541, 1820.2131147541, 1820.2131147541, 1820.2131147541,
1820.2131147541, 1820.2131147541, 1820.2131147541, 1820.2131147541,
1820.2131147541, 1820.2131147541, 1820.2131147541, 1820.2131147541,
1820.2131147541, 1820.2131147541, 1820.2131147541, 1820.2131147541,
1820.2131147541, 1820.2131147541, 1820.2131147541, 1820.2131147541,
1820.2131147541, 1820.2131147541, 1820.2131147541, 1820.2131147541,
1820.2131147541, 1820.2131147541, 1820.2131147541, 1820.2131147541,
1820.2131147541, 1820.2131147541, 1820.2131147541, 1824.33333333333,
1824.33333333333, 1824.33333333333, 1824.33333333333, 1824.33333333333,
1824.33333333333, 1824.33333333333, 1824.33333333333, 1824.33333333333,
1826.89589041096, 1826.89589041096, 1826.89589041096, 1826.89589041096,
1826.89589041096, 1826.89589041096, 1826.89589041096, 1826.89589041096,
1826.89589041096, 1826.89589041096, 1826.89589041096, 1826.89589041096,
1826.89589041096, 1826.89589041096, 1828.28142076503, 1828.28142076503,
1828.28142076503, 1828.28142076503, 1828.28142076503, 1828.28142076503,
1828.28142076503), dead = c(1863.97808219178, 1863.97808219178,
1863.97808219178, 1863.97808219178, 1863.97808219178, 1863.97808219178,
1863.97808219178, 1844.07103825137, 1844.07103825137, 1844.07103825137,
1844.07103825137, 1844.07103825137, 1844.07103825137, 1844.07103825137,
1844.07103825137, 1890.85205479452, 1890.85205479452, 1890.85205479452,
1890.85205479452, 1890.85205479452, 1890.85205479452, 1890.85205479452,
1890.85205479452, 1890.85205479452, 1871.81643835616, 1871.81643835616,
1871.81643835616, 1871.81643835616, 1871.81643835616, 1871.81643835616,
1871.81643835616, 1871.81643835616, 1871.81643835616, 1871.81643835616,
1871.81643835616, 1871.81643835616, 1871.81643835616, 1871.81643835616,
1871.81643835616, 1871.81643835616, 1871.81643835616, 1871.81643835616,
1871.81643835616, 1871.81643835616, 1871.81643835616, 1871.81643835616,
1871.81643835616, 1871.81643835616, 1871.81643835616, 1871.81643835616,
1871.81643835616, 1871.81643835616, 1871.81643835616, 1871.81643835616,
1871.81643835616, 1871.81643835616, 1871.81643835616, 1902.33150684932,
1902.33150684932, 1902.33150684932, 1902.33150684932, 1902.33150684932,
1902.33150684932, 1902.33150684932, 1902.33150684932, 1902.33150684932,
1862.01643835616, 1862.01643835616, 1862.01643835616, 1862.01643835616,
1862.01643835616, 1862.01643835616, 1862.01643835616, 1862.01643835616,
1862.01643835616, 1862.01643835616, 1862.01643835616, 1862.01643835616,
1862.01643835616, 1862.01643835616, 1852.22950819672, 1852.22950819672,
1852.22950819672, 1852.22950819672, 1852.22950819672, 1852.22950819672,
1852.22950819672), Marriage_Date = c(1837.8602739726, 1837.8602739726,
1837.8602739726, 1837.8602739726, 1837.8602739726, 1837.8602739726,
1837.8602739726, 1838.3397260274, 1838.3397260274, 1838.3397260274,
1838.3397260274, 1838.3397260274, 1838.3397260274, 1838.3397260274,
1838.3397260274, 1844.91256830601, 1844.91256830601, 1844.91256830601,
1844.91256830601, 1844.91256830601, 1844.91256830601, 1844.91256830601,
1844.91256830601, 1844.91256830601, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1850.10684931507, 1850.10684931507,
1850.10684931507, 1850.10684931507, 1850.10684931507, 1850.10684931507,
1850.10684931507, 1850.10684931507, 1850.10684931507, 1857.06301369863,
1857.06301369863, 1857.06301369863, 1857.06301369863, 1857.06301369863,
1857.06301369863, 1857.06301369863, 1857.06301369863, 1857.06301369863,
1857.06301369863, 1857.06301369863, 1857.06301369863, 1857.06301369863,
1857.06301369863, NA, NA, NA, NA, NA, NA, NA), start3 = c(0,
0.430136986301477, 1.43013698630148, 2.43013698630148, 3.43013698630148,
4.43013698630148, 5.43013698630148, 0, 0.512328767123336, 1.51232876712334,
2.51232876712334, 3.51232876712334, 4.51232876712334, 6.51232876712334,
5.51232876712334, 0, 0.989041095890343, 1.98904109589034, 2.98904109589034,
3.98904109589034, 4.98904109589034, 5.98904109589034, 6.98904109589034,
7.98904109589034, 0, 0.786885245901658, 1.78688524590166, 2.78688524590166,
3.78688524590166, 4.78688524590166, 5.78688524590166, 6.78688524590166,
7.78688524590166, 8.78688524590166, 9.78688524590166, 10.7868852459017,
11.7868852459017, 12.7868852459017, 13.7868852459017, 14.7868852459017,
15.7868852459017, 16.7868852459017, 17.7868852459017, 18.7868852459017,
19.7868852459017, 20.7868852459017, 21.7868852459017, 22.7868852459017,
23.7868852459017, 24.7868852459017, 25.7868852459017, 26.7868852459017,
27.7868852459017, 28.7868852459017, 29.7868852459017, 30.7868852459017,
31.7868852459017, 0, 0.666666666666742, 1.66666666666674, 2.66666666666674,
3.66666666666674, 4.66666666666674, 5.66666666666674, 6.66666666666674,
7.66666666666674, 0, 0.104109589041173, 1.10410958904117, 2.10410958904117,
3.10410958904117, 4.10410958904117, 5.10410958904117, 6.10410958904117,
7.10410958904117, 8.10410958904117, 9.10410958904117, 10.1041095890412,
12.1041095890412, 11.1041095890412, 0, 0.718579234972594, 1.71857923497259,
2.71857923497259, 3.71857923497259, 5.71857923497259, 4.71857923497259
), stop3 = c(0.430136986301477, 1.43013698630148, 2.43013698630148,
3.43013698630148, 4.43013698630148, 5.43013698630148, 6.2904109589042,
0.512328767123336, 1.51232876712334, 2.51232876712334, 3.51232876712334,
4.51232876712334, 5.51232876712334, 6.85205479452065, 6.51232876712334,
0.989041095890343, 1.98904109589034, 2.98904109589034, 3.98904109589034,
4.98904109589034, 5.98904109589034, 6.98904109589034, 7.98904109589034,
8.90160940190117, 0.786885245901658, 1.78688524590166, 2.78688524590166,
3.78688524590166, 4.78688524590166, 5.78688524590166, 6.78688524590166,
7.78688524590166, 8.78688524590166, 9.78688524590166, 10.7868852459017,
11.7868852459017, 12.7868852459017, 13.7868852459017, 14.7868852459017,
15.7868852459017, 16.7868852459017, 17.7868852459017, 18.7868852459017,
19.7868852459017, 20.7868852459017, 21.7868852459017, 22.7868852459017,
23.7868852459017, 24.7868852459017, 25.7868852459017, 26.7868852459017,
27.7868852459017, 28.7868852459017, 29.7868852459017, 30.7868852459017,
31.7868852459017, 32, 0.666666666666742, 1.66666666666674, 2.66666666666674,
3.66666666666674, 4.66666666666674, 5.66666666666674, 6.66666666666674,
7.66666666666674, 7.77351598173527, 0.104109589041173, 1.10410958904117,
2.10410958904117, 3.10410958904117, 4.10410958904117, 5.10410958904117,
6.10410958904117, 7.10410958904117, 8.10410958904117, 9.10410958904117,
10.1041095890412, 11.1041095890412, 12.1671232876713, 12.1041095890412,
0.718579234972594, 1.71857923497259, 2.71857923497259, 3.71857923497259,
4.71857923497259, 5.94808743169392, 5.71857923497259), ord = c("1",
"1", "1", "1", "1", "1", "1", "2", "2", "2", "2", "2", "2", "2",
"2", "3", "3", "3", "3", "3", "3", "3", "3", "3", "4", "4", "4",
"4", "4", "4", "4", "4", "4", "4", "4", "4", "4", "4", "4", "4",
"4", "4", "4", "4", "4", "4", "4", "4", "4", "4", "4", "4", "4",
"4", "4", "4", "4", "6", "6", "6", "6", "6", "6", "6", "6", "6",
"7", "7", "7", "7", "7", "7", "7", "7", "7", "7", "7", "7", "7",
"7", "1", "1", "1", "1", "1", "1", "1"), tstart = c(1831.5698630137,
1832, 1833, 1834, 1835, 1836, 1837, 1831.48767123288, 1832, 1833,
1834, 1835, 1836, 1838, 1837, 1836.01095890411, 1837, 1838, 1839,
1840, 1841, 1842, 1843, 1844, 1838.2131147541, 1839, 1840, 1841,
1842, 1843, 1844, 1845, 1846, 1847, 1848, 1849, 1850, 1851, 1852,
1853, 1854, 1855, 1856, 1857, 1858, 1859, 1860, 1861, 1862, 1863,
1864, 1865, 1866, 1867, 1868, 1869, 1870, 1842.33333333333, 1843,
1844, 1845, 1846, 1847, 1848, 1849, 1850, 1844.89589041096, 1845,
1846, 1847, 1848, 1849, 1850, 1851, 1852, 1853, 1854, 1855, 1857,
1856, 1846.28142076503, 1847, 1848, 1849, 1850, 1852, 1851),
tstop = c(1832, 1833, 1834, 1835, 1836, 1837, 1837.8602739726,
1832, 1833, 1834, 1835, 1836, 1837, 1838.3397260274, 1838,
1837, 1838, 1839, 1840, 1841, 1842, 1843, 1844, 1844.91256830601,
1839, 1840, 1841, 1842, 1843, 1844, 1845, 1846, 1847, 1848,
1849, 1850, 1851, 1852, 1853, 1854, 1855, 1856, 1857, 1858,
1859, 1860, 1861, 1862, 1863, 1864, 1865, 1866, 1867, 1868,
1869, 1870, 1870.2131147541, 1843, 1844, 1845, 1846, 1847,
1848, 1849, 1850, 1850.10684931507, 1845, 1846, 1847, 1848,
1849, 1850, 1851, 1852, 1853, 1854, 1855, 1856, 1857.06301369863,
1857, 1847, 1848, 1849, 1850, 1851, 1852.22950819672, 1852
)), .Names = c("id", "mid", "born", "dead", "Marriage_Date",
"start3", "stop3", "ord", "tstart", "tstop"), row.names = c(NA,
-87L), class = c("grouped_df", "tbl_df", "tbl", "data.frame"), vars = c("mid",
"id"), drop = TRUE, indices = list(66:79, 15:23, 7:14, 24:56,
0:6, 57:65, 80:86), group_sizes = c(14L, 9L, 8L, 33L, 7L,
9L, 7L), biggest_group_size = 33L, labels = structure(list(mid = c("1004212",
"1004212", "1004212", "1004212", "1004212", "1004212", "1004241"
), id = c("1097672", "1112778", "1132576", "1667687", "1799939",
"1933509", "1039958")), row.names = c(NA, -7L), class = "data.frame", vars = c("mid",
"id"), drop = TRUE, .Names = c("mid", "id")))
我的许多试验之一是:
# Asker's attempt, kept exactly as posted (it does not give the wanted count).
# Sort the rows by mother id, then by birth order.
fam <- fam[
with(fam, order(mid, ord)),
]
#fam <- fam[ which(fam$mid > 1004211 & fam$mid < 1004246 ) ,]
# Group by mother and child.
fam <- group_by(fam, mid, id)
# Fill older_sibs_alive one row at a time.
# NOTE(review): which() is closed right after the `mid` comparison, so
# length() measures the vector produced by the `&` chain instead of counting
# matching rows; the `id == id[i]` term also selects the ego's own rows
# rather than the siblings'.
for(i in 1:nrow(fam)){
fam$older_sibs_alive[i] <- length(which(fam$mid==fam$mid[i]) & fam$id==fam$id[i] & lag(fam$dead) < fam$Marriage_Date[i])
}
当然不是我想要的
有谁有好的解决方案吗?非常感谢。
以下解决方案可以满足您的要求:
代码
library(dplyr)

# Add `n_older_siblings` to every person-year row of `fam`: the number of
# children of the same mother with a lower birth order who are alive at the
# row's `tstop`.
#
# NOTE(review): a missing `dead` date makes the comparison NA and so the whole
# count NA for that row; decide how such siblings should count before adding
# `na.rm = TRUE`.
fam <- fam %>%
  # The data was provided as a grouped_df; ungroup for a clean slate.
  ungroup() %>%
  # Keep only the fields needed to describe a sibling.
  select(
    sibling.id = id,
    mid,
    sibling.born = born,
    sibling.dead = dead,
    ord
  ) %>%
  # One row per child, with the birth order as a number.
  distinct(
    sibling.id,
    mid,
    sibling.born,
    sibling.dead,
    sibling.ord = as.numeric(ord)
  ) %>%
  # Pair each row of the original fam with every child of the same mother
  # (the ego itself included; it is excluded in the count below).
  inner_join(fam, by = c("mid" = "mid")) %>%
  # Group by all the columns of the original fam so that summarise() returns
  # one row per distinct original row.
  group_by_at(vars(one_of(colnames(fam)))) %>%
  summarise(
    n_older_siblings = sum(
      sibling.id != id & # the row is a sibling, not the ego
        sibling.born <= tstop & # the sibling has been born
        sibling.dead > tstop & # the sibling is not yet dead
        sibling.ord < as.numeric(ord) # the sibling was born first
    )
  ) %>%
  # Remove the sibling columns.
  select(-contains("sibling.")) %>%
  # summarise() drops only the last grouping level; clear the rest so later
  # verbs do not silently run once per leftover group.
  ungroup()
说明
对于 `data.frame` 中的每条记录,我们都需要知道此人的兄弟姐妹。因此,首先创建一个新的 `tibble`,其中每个人只占一行,仅包含其 id、mid、born、dead 和 ord。
# Build the sibling lookup: one row per child, columns prefixed "sibling.".
fam %>%
  # fam is grouped by mid and id; ungroup first so the grouping does not
  # interfere with select() and distinct().
  ungroup() %>%
  select(
    sibling.id = id,
    mid,
    sibling.born = born,
    sibling.dead = dead,
    ord
  ) %>%
  distinct(
    sibling.id,
    mid,
    sibling.born,
    sibling.dead,
    sibling.ord = as.numeric(ord)
  )
我们将这个兄弟姐妹 `tibble` 连接(join)到原始的 `fam` `data.frame` 上。结果是一个更大的 `tibble`,其中每一行都是“个人 × 观察时段 × 同母的某个孩子”的唯一组合。由于按 `mid` 连接也会匹配到本人,如果一个人有 5 个兄弟姐妹,那么 `fam` 中的每一行现在会变成 6 行。
# Pipeline step (its input is the sibling tibble): match on the mother id so
# that each row of fam is paired with every child of the same mother.
inner_join(fam, by = c("mid" = "mid"))
我们想把它折叠回原来的行数,方法是对数据进行分组并汇总:按原始 `fam` `data.frame` 中的所有列分组,然后对一组逻辑表达式的合取结果求和(即统计其为 TRUE 的行数),这些逻辑表达式编码了您要统计的条件。
# Pipeline step: regroup on every original fam column, then count per group
# the joined rows that satisfy all four conditions below.
group_by_at(vars(one_of(colnames(fam)))) %>%
summarise(n_older_siblings =
sum(sibling.id != id & # the joined row is another child, not the ego
sibling.born <= tstop & # born on or before the end of the interval
sibling.dead > tstop & # dies after the end of the interval
sibling.ord < as.numeric(ord))) %>% # lower birth order than the ego
`group_by_at` 允许我们用代码指定要分组的列;这里的 `vars(one_of(colnames(fam)))` 表示按 `fam` `data.frame` 中的所有列进行分组。
最后,去掉名称中包含 "sibling." 的列,因为这些是我们一开始在兄弟姐妹 `tibble` 中创建的列。