将大列表的元素取消嵌套到数据框中

Unnesting elements of large list into dataframe

我目前正在处理一个从 Crossref API 检索到的大型数据集,其中包含按 DOI 检索到的科学论文信息。

目前这个大列表包含约 3500 个元素。其中每个元素本身也是一个列表,由元数据 'meta'、实际相关的数据 'data' 和无关的列表 'facets' 组成。

这是基于两个 DOI 的两个列表的示例:

list(`10.1158/1055-9965.EPI-08-0303` = list(meta = NULL, data = structure(list(
    alternative.id = "10.1158/1055-9965.EPI-08-0303", container.title = "Cancer Epidemiology Biomarkers & Prevention", 
    created = "2008-11-06", deposited = "2020-12-24", published.print = "2008-11", 
    published.online = "2008-11-06", doi = "10.1158/1055-9965.epi-08-0303", 
    indexed = "2021-10-17", issn = "1055-9965,1538-7755", issue = "11", 
    issued = "2008-11", member = "1086", page = "3216-3223", 
    prefix = "10.1158", publisher = "American Association for Cancer Research (AACR)", 
    score = "1", source = "Crossref", reference.count = "31", 
    references.count = "31", is.referenced.by.count = "50", subject = "Oncology,Epidemiology", 
    title = "20 Years into the Gambia Hepatitis Intervention Study: Assessment of Initial Hypotheses and Prospects for Evaluation of Protective Effectiveness Against Liver Cancer", 
    type = "journal-article", url = "http://dx.doi.org/10.1158/1055-9965.epi-08-0303", 
    volume = "17", language = "en", short.container.title = "Cancer Epidemiol Biomarkers Prev", 
    author = list(structure(list(given = c("Simonetta", "Patrizia", 
    "Ebrima", "Andrew J.", "Gregory D.", "Maimuna", "Ruggero", 
    "Amelie", "Omar", "Marianne", "Hilton", "Pierre"), family = c("Viviani", 
    "Carrieri", "Bah", "Hall", "Kirk", "Mendy", "Montesano", 
    "Plymoth", "Sam", "Van der Sande", "Whittle", "Hainaut"), 
        sequence = c("first", "additional", "additional", "additional", 
        "additional", "additional", "additional", "additional", 
        "additional", "additional", "additional", "additional"
        )), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, 
    -12L))), link = list(structure(list(URL = "https://syndication.highwire.org/content/doi/10.1158/1055-9965.EPI-08-0303", 
        content.type = "unspecified", content.version = "vor", 
        intended.application = "similarity-checking"), class = c("tbl_df", 
    "tbl", "data.frame"), row.names = c(NA, -1L)))), class = c("tbl_df", 
"tbl", "data.frame"), row.names = c(NA, -1L)), facets = NULL), 
    `10.1016/j.canlet.2007.10.044` = list(meta = NULL, data = structure(list(
        alternative.id = "S0304383507005253", container.title = "Cancer Letters", 
        created = "2008-01-14", deposited = "2019-01-01", published.print = "2008-03", 
        doi = "10.1016/j.canlet.2007.10.044", indexed = "2021-10-07", 
        issn = "0304-3835", issue = "1", issued = "2008-03", 
        member = "78", page = "21-25", prefix = "10.1016", publisher = "Elsevier BV", 
        score = "1", source = "Crossref", reference.count = "20", 
        references.count = "20", is.referenced.by.count = "71", 
        subject = "Cancer Research,Oncology", title = "Detection of R337H, a germline TP53 mutation predisposing to multiple cancers, in asymptomatic women participating in a breast cancer screening program in Southern Brazil", 
        type = "journal-article", url = "http://dx.doi.org/10.1016/j.canlet.2007.10.044", 
        volume = "261", language = "en", short.container.title = "Cancer Letters", 
        author = list(structure(list(given = c("Edenir Inêz", 
        "Lavínia", "Maira", "Maria Isabel Waddington", "Magali", 
        "Ghyslaine", "Virginie", "Ernestina", "Juliana", "Ingrid Petroni", 
        "Roberto", "Pierre", "Patricia"), family = c("Palmero", 
        "Schüler-Faccini", "Caleffi", "Achatz", "Olivier", "Martel-Planche", 
        "Marcel", "Aguiar", "Giacomazzi", "Ewald", "Giugliani", 
        "Hainaut", "Ashton-Prolla"), sequence = c("first", "additional", 
        "additional", "additional", "additional", "additional", 
        "additional", "additional", "additional", "additional", 
        "additional", "additional", "additional")), class = c("tbl_df", 
        "tbl", "data.frame"), row.names = c(NA, -13L))), link = list(
            structure(list(URL = c("https://api.elsevier.com/content/article/PII:S0304383507005253?httpAccept=text/xml", 
            "https://api.elsevier.com/content/article/PII:S0304383507005253?httpAccept=text/plain"
            ), content.type = c("text/xml", "text/plain"), content.version = c("vor", 
            "vor"), intended.application = c("text-mining", "text-mining"
            )), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, 
            -2L))), license = list(structure(list(date = "2008-03-01", 
            content.version = "tdm", delay.in.days = 0L, URL = "https://www.elsevier.com/tdm/userlicense/1.0/"), class = c("tbl_df", 
        "tbl", "data.frame"), row.names = c(NA, -1L))), reference = list(
            structure(list(key = c("10.1016/j.canlet.2007.10.044_bib1", 
            "10.1016/j.canlet.2007.10.044_bib2", "10.1016/j.canlet.2007.10.044_bib3", 
            "10.1016/j.canlet.2007.10.044_bib4", "10.1016/j.canlet.2007.10.044_bib5", 
            "10.1016/j.canlet.2007.10.044_bib6", "10.1016/j.canlet.2007.10.044_bib7", 
            "10.1016/j.canlet.2007.10.044_bib8", "10.1016/j.canlet.2007.10.044_bib9", 
            "10.1016/j.canlet.2007.10.044_bib10", "10.1016/j.canlet.2007.10.044_bib11", 
            "10.1016/j.canlet.2007.10.044_bib12", "10.1016/j.canlet.2007.10.044_bib13", 
            "10.1016/j.canlet.2007.10.044_bib14", "10.1016/j.canlet.2007.10.044_bib15", 
            "10.1016/j.canlet.2007.10.044_bib16", "10.1016/j.canlet.2007.10.044_bib17", 
            "10.1016/j.canlet.2007.10.044_bib18", "10.1016/j.canlet.2007.10.044_bib19", 
            "10.1016/j.canlet.2007.10.044_bib20"), doi.asserted.by = c("crossref", 
            "crossref", NA, NA, NA, "crossref", NA, NA, "crossref", 
            "crossref", "crossref", NA, NA, NA, "crossref", "crossref", 
            "crossref", "crossref", "crossref", NA), first.page = c("96", 
            "266", "1298", "877s", "1932", "12", "10", "608", 
            "2658", "1954", "133", "1703", "1365", "5358", "1215", 
            "607", "647", "607", "9330", "1213"), DOI = c("10.1016/j.canlet.2005.12.039", 
            "10.1159/000154228", NA, NA, NA, "10.1038/nsb730", 
            NA, NA, "10.1002/1097-0142(19901215)66:12<2658::AID-CNCR2820661232>3.0.CO;2-C", 
            "10.1038/sj.onc.1207305", "10.1016/0165-4608(93)90166-J", 
            NA, NA, NA, "10.1093/nar/16.3.1215", "10.1002/humu.10081", 
            "10.1590/S0004-27302004000500009", "10.1007/BF00202835", 
            "10.1073/pnas.161479898", NA), article.title = c("The TP53 mutation, R337H, is associated with Li–Fraumeni and Li–Fraumeni-like syndromes in Brazilian families", 
            "Is p53 polymorphism maintained by natural selection?", 
            "Prevalence and diversity of constitutional mutations in the p53 gene among 21 Li–Fraumeni families", 
            "Breast cancer screening in 10,000 women of an underserved population in South Brazil: The NMAMAPOA cohort", 
            "P53 germline mutations in childhood cancers and cancer risk for carrier individuals", 
            "A novel mechanism of tumorigenesis involving pH-dependent destabilization of a mutant p53 tetramer", 
            "Germline mutations in the TP53 gene", "Germ-line p53 mutations in 15 families with Li–Fraumeni syndrome", 
            "Choroid plexus tumors in the breast cancer-sarcoma syndrome", 
            "A TP53 polymorphism is associated with increased risk of colorectal cancer and with reduced levels of TP53 mRNA", 
            "Wilms’ tumor in the Li–Fraumeni cancer family syndrome", 
            "Simple sequence repeat polymorphism within the p53 gene", 
            "Rhabdomyosarcoma in children: epidemiologic study and identification of a familial cancer syndrome", 
            "A cancer family syndrome in twenty-four kindreds", 
            "A simple salting out procedure for extracting DNA from human nucleated cells", 
            "The IARC TP53 database: new online mutation analysis and recommendations to users", 
            "Founder effect for the highly prevalent R337H mutation of tumor suppressor p53 in Brazilian patients with adrenocortical tumors", 
            "Identification of a polymorphism in intron 2 of the p53 gene", 
            "An inherited p53 mutation that contributes in a tissue-specific manner to pediatric adrenal cortical carcinoma", 
            "Cancer in survivors of childhood soft tissue sarcoma and their relatives"
            ), volume = c("245", "44", "54", "23", "82", "9", 
            "25", "56", "66", "23", "67", "8", "43", "48", "16", 
            "19", "48", "93", "98", "79"), author = c("Achatz", 
            "Beckman", "Birch", "Caleffi", "Chompret", "DiGiammarino", 
            "Eeles", "Frebourg", "Garber", "Gemignani", "Hartley", 
            "Lazar", "Li", "Li", "Miller", "Olivier", "Pinto", 
            "Pleasants", "Ribeiro", "Strong"), year = c("2007", 
            "1994", "1994", "2005", "2000", "2002", "1995", "1995", 
            "1990", "2004", "1993", "1993", "1969", "1988", "1988", 
            "2002", "2004", "1994", "2001", "1987"), journal.title = c("Cancer Lett.", 
            "Hum. Hered.", "Cancer Res.", "J. Clin. Oncol.", 
            "Br. J. Cancer", "Nat. Struct. Biol.", "Cancer Surv.", 
            "Am. J. Hum. Genet.", "Cancer", "Oncogene", "Cancer Genet. Cytogenet.", 
            "Oncogene", "J. Natl. Cancer Inst.", "Cancer Res.", 
            "Nucleic Acids Res.", "Hum. Mutat.", "Arq. Bras. Endocrinol. Metabol.", 
            "Hum. Genet.", "Proc. Natl. Acad. Sci. USA", "J. Natl. Cancer Inst."
            ), issue = c(NA, "5", "5", "16 S", "12", "1", NA, 
            "3", "12", "10", "2", "6", "6", "18", "3", "6", "5", 
            "5", "16", "6")), class = c("tbl_df", "tbl", "data.frame"
            ), row.names = c(NA, -20L)))), class = c("tbl_df", 
    "tbl", "data.frame"), row.names = c(NA, -1L)), facets = NULL))

'data' 下的所有数据都与我相关,我想取消嵌套并构建一个大型数据框,DOI 在一列中,'data' 下的数据在其他列中。

我试过用 unnest(data) 来取消嵌套,但得到如下错误:Error in UseMethod("unnest") : no applicable method for 'unnest' applied to an object of class "list"

有没有简单的方法来做到这一点?

像这样?注意:提问时最好附上一个使用玩具数据集的最小可复现示例(minimal reprex),而不是直接贴出你手头数据的快照,这样问题可能会更快得到解答。

# Toy input: a list named by DOI. Each element is itself a list holding an
# unused `metadata` slot and a one-row tibble under `data`.
# `tibble::` is spelled out on every call so the snippet runs without a prior
# library(tibble).
ll <- list(
  `10.1016/j.ejca.2017.11.029` = list(
    metadata = NULL,
    data = tibble::tibble(one = 1, two = 2)
  ),
  `10.1016/j.ejca.2017.12.500` = list(
    metadata = NULL,
    data = tibble::tibble(one = 3, two = 4)
  )
)

# The DOIs are carried by the names of the outer list.
nms <- names(ll)
# Pull the `data` element out of every entry. Selecting by name instead of by
# position keeps this correct even if the order of the sub-elements differs;
# `[[` matches names exactly, so `metadata` is never picked up by mistake.
vals <- lapply(ll, `[[`, "data")

# One row per DOI; `data` is a list-column holding each record's tibble.
# NOTE(review): if tidyr is available, tidyr::unnest(<result>, data) should
# expand that list-column into regular columns - confirm on the real data.
tibble::tibble(DOI = nms,
       data = vals)

# or shorten to
tibble::tibble(DOI = names(ll),
               data = lapply(ll, `[[`, "data"))

# A tibble: 2 x 2
  DOI                        data            
  <chr>                      <named list>    
1 10.1016/j.ejca.2017.11.029 <tibble [1 x 2]>
2 10.1016/j.ejca.2017.12.500 <tibble [1 x 2]>

解释:这里外层列表的名称(DOI)本身就携带信息,而相关的数据是每个元素中的第二个子元素(即 'data')。lapply(ll, `[[`, 2)(也可以按名称写成 lapply(ll, `[[`, "data"))等同于 list(ll[[1]][[2]], ll[[2]][[2]], ... ),并且会保留 ll 的名称。