Объедините строки JSON с разным количеством столбцов, некоторые из которых не были помечены, используя melt или dplyr в R
Это связано с предыдущим вопросом. Однако проблема развилась. У меня есть данные JSON, которые находятся в трех столбцах: "Левый", "Kwic" и "Правый". Столбцы "Левый" и "Правый" иногда подразделяются. Это подразделение обозначается в файле JSON как "класс". Однако этот "класс" часто не имеет маркировки. В подразделенных столбцах всегда будет класс с именем "coll".
Превосходное решение, представленное ранее, заключалось в том, чтобы получить столбцы "pre" и "post" и переименовать их для включения в фрейм данных. Однако теперь у нас есть смесь столбцов, некоторые разделены, а некоторые нет.
То, что я хотел бы сделать, это взять неразделенные данные и добавить их в центральный столбец "coll". Это относится как к левому, так и к правому дивизионам. Однако сейчас я могу записать их только как отдельные столбцы. Я пробовал разные вещи с melt и dplyr2, но безрезультатно.
Данные:
structure(list(Left = list(structure(list(class = "", str = " children tend to view authority figures"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = "children have a computer . Wireless resources"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = "unclear if increases in physical activity are"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = "filtration pressure . Where recurrent disease is"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = c("", "strc", ""), str = c("multiply .", "</p><p>",
"When nevirapine is no longer")), .Names = c("class", "str"
), class = "data.frame", row.names = c(NA, 3L)), structure(list(
class = "", str = "white . We don't provide enough services ,"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = ", a sexually transmitted infection , are"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = "continuous lowgrade itching and linear lesions"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = " radiation oncology community is largely"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = "the variability in response time that was"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = "incremental cost effectiveness ratio that is"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = "Through the use of warming , acrid herbs"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = "start using tobacco : psychosocial factors"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = "determining the severity because the fetus was"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = c("", "coll", ""), str = c("This occurred despite the ",
"significantly", " ")), .Names = c("class", "str"), class = "data.frame", row.names = c(NA,
3L)), structure(list(class = "", str = "mission to eliminate the suffering and death"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = "are more likely to be present , or to be"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = "demonstrated primarily pulmonary signs and symptoms"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = "criminal involvement . These findings are"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = "model . There is a danger in using herbs"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L)), Kwic = list(structure(list(
class = "col0 coll", str = " such"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " such"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " due"), .Names = c("class", "str"
), class = "data.frame", row.names = 1L), structure(list(class = "col0 coll",
str = " responsible"), .Names = c("class", "str"), class = "data.frame", row.names = 1L),
structure(list(class = "col0 coll", str = " present"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " such"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " responsible"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " consistent"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " responsible"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " due"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " less"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " such"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " such"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " less"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = "higher"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " due"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " present"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " such"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " consistent"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " such"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L)), Right = list(
structure(list(class = c("", "coll", ""), str = c(" ", "as",
" physicians and parents as legitimate")), .Names = c("class",
"str"), class = "data.frame", row.names = c(NA, 3L)), structure(list(
class = c("", "coll", ""), str = c(" ", "as", " radio / CD headsets , handheld televisions"
)), .Names = c("class", "str"), class = "data.frame", row.names = c(NA,
3L)), structure(list(class = c("", "coll", ""), str = c(" ",
"to", " the physical environment itself , or")), .Names = c("class",
"str"), class = "data.frame", row.names = c(NA, 3L)), structure(list(
class = c("", "coll", ""), str = c(" ", "for", " blockage of lymphatic collaterals ,"
)), .Names = c("class", "str"), class = "data.frame", row.names = c(NA,
3L)), structure(list(class = c("", "coll", ""), str = c(" ",
"in", " the blood , the HIV strains that are")), .Names = c("class",
"str"), class = "data.frame", row.names = c(NA, 3L)), structure(list(
class = c("", "coll", "", "strc", ""), str = c(" ", "as",
" Spanish services . \"", "</p><p>", "She admits")), .Names = c("class",
"str"), class = "data.frame", row.names = c(NA, 5L)), structure(list(
class = c("", "coll", ""), str = c(" ", "for", " the majority of cervical cancer cases"
)), .Names = c("class", "str"), class = "data.frame", row.names = c(NA,
3L)), structure(list(class = c("", "coll", "", "strc", ""
), str = c(" ", "with", " vigorous scratching .", "</p><p>",
"Psoriasis")), .Names = c("class", "str"), class = "data.frame", row.names = c(NA,
5L)), structure(list(class = c("", "coll", ""), str = c(" ",
"for", " having treated hundreds of thousands")), .Names = c("class",
"str"), class = "data.frame", row.names = c(NA, 3L)), structure(list(
class = c("", "coll", ""), str = c(" ", "to", " the distractor-ratio manipulation and"
)), .Names = c("class", "str"), class = "data.frame", row.names = c(NA,
3L)), structure(list(class = c("", "coll", ""), str = c(" ",
"than", " £ 30 000 per quality adjusted life")), .Names = c("class",
"str"), class = "data.frame", row.names = c(NA, 3L)), structure(list(
class = c("", "coll", ""), str = c(" ", "as", " aconitum carmichaeli praeparatum ( fu"
)), .Names = c("class", "str"), class = "data.frame", row.names = c(NA,
3L)), structure(list(class = c("", "coll", ""), str = c(" ",
"as", " personality or parental role modeling")), .Names = c("class",
"str"), class = "data.frame", row.names = c(NA, 3L)), structure(list(
class = c("", "coll", ""), str = c(" ", "than", " 28 weeks old , and the bilirubin had"
)), .Names = c("class", "str"), class = "data.frame", row.names = c(NA,
3L)), structure(list(class = "", str = " level of psychiatric symptoms observed "), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = c("", "coll", "", "strc", ""), str = c(" ", "to",
" all cancers by 2015 .", "</p><p>", "The primary")), .Names = c("class",
"str"), class = "data.frame", row.names = c(NA, 5L)), structure(list(
class = c("", "coll", ""), str = c(" ", "in", " higher numbers , in sputum cultures "
)), .Names = c("class", "str"), class = "data.frame", row.names = c(NA,
3L)), structure(list(class = c("", "coll", "", "strc"), str = c(" ",
"as", " wheezing and shortness of breath .", "</p>")), .Names = c("class",
"str"), class = "data.frame", row.names = c(NA, 4L)), structure(list(
class = c("", "coll", ""), str = c(" ", "with", " those from DeLeon and Jainchill 's"
)), .Names = c("class", "str"), class = "data.frame", row.names = c(NA,
3L)), structure(list(class = c("", "coll", ""), str = c(" ",
"as", " mahuang in highly concentrated extracts")), .Names = c("class",
"str"), class = "data.frame", row.names = c(NA, 3L)))), .Names = c("Left",
"Kwic", "Right"), class = "data.frame", row.names = c(NA, 20L
))
Основная проблема заключается в том, что некоторые столбцы не имеют маркировки, но могут быть идентифицированы на основе структуры данных. Код ниже генерирует вывод, который почти у цели, но примирение с этим небольшим отличием привело меня к безумию.
Код:
## generate raw output
documentdata <- document$Lines[, c("Left", "Kwic", "Right")]
documentdata = cbind(documentdata,SeekID=query)
## generate tidied output
## generate left columns
docx <- melt(documentdata$Left, id.vars = c("class"))
pre <- which(docx$class %in% c("coll")) - 1
post <- which(docx$class %in% c("coll")) + 1
docx$class[pre] = "l.pre"
docx$class[post] = "l.post"
docx <- dcast(docx, L1 + variable ~ class, fun.aggregate=list)
names(docx)[names(docx)=="Var.3"] <- "l.full"
names(docx)[names(docx)=="coll"] <- "l.coll"
docx.left <- docx[, c("l.full", "l.pre", "l.coll", "l.post")]
#docx.left <- documentdata$Left %>% do.call(rbind, .) %>%
# do(data.frame(l.pre = .[["str"]][which(.[["class"]]=="coll")-1],
# l.coll = .[["str"]][which(.[["class"]]=="coll")],
# l.post = .[["str"]][which(.[["class"]]=="coll")+1]))
## generate center columns
docx <- melt(documentdata$Kwic, id.vars = c("class"))
names(docx)[names(docx)=="value"] <- "k.coll"
docx.kwic = docx[, c("k.coll"), drop = FALSE]
## generate right columns
docx <- melt(documentdata$Right, id.vars = c("class"))
post <- which(docx$class %in% c("coll")) + 1
docx$class[post] = "r.post"
docx <- dcast(docx, L1 + variable ~ class, fun.aggregate=list)
names(docx)[names(docx)=="coll"] <- "r.coll"
names(docx)[names(docx)=="Var.3"] <- "r.pre"
docx.right <- docx[, c("r.pre", "r.coll", "r.post")]
## final output
docx.output = cbind(docx.left, docx.kwic, docx.right)
docx.output = cbind(docx.output,SeekID=query)
docx.output <- docx.output[, c("SeekID", "l.full", "l.pre", "l.coll", "l.post", "k.coll", "r.pre", "r.coll", "r.post")]
Исправленный код (предоставлен @cgjeremy) [решено]
## general parsing function that handles "r" and "l" differently
myparse <- function(x, side){
if(any(x$class=="coll")){
pre <- x$str[which(x$class=="coll")-1]
coll <- x$str[which(x$class=="coll")]
post <- x$str[which(x$class=="coll")+1]
} else if(side=="l"){
pre <- paste0(x$str, collapse="")
coll <- ""
post <- ""
} else if(side=="r"){
pre <- ""
coll <- ""
post <- paste0(x$str, collapse="")
} else {
pre <- ""
coll <- ""
post <- ""
}
z <- data.frame(pre, coll, post)
names(z) <-c(paste0(side, ".pre"), paste0(side, ".coll"), paste0(side, ".post"))
z
}
## calls parsing function to generate left, mid, and right column-sets
library(dplyr)
left <- documentdata$Left %>% lapply(myparse, side="l") %>%
do.call(rbind, .)
mid <- do.call(rbind, documentdata$Kwic)$str
right <- documentdata$Right %>% lapply(myparse, side="r") %>%
do.call(rbind, .)
## combines left, mid, and right columns-sets to generate final output
docx.output <- cbind(left, mid, right)
Заранее спасибо за любую помощь или совет,
1 ответ
Я не совсем уверен в ваших правилах для правой стороны, но я думаю, что это то, что вы хотите.
Сначала давайте определим функцию синтаксического анализа:
myparse <- function(x, side){
if(any(x$class=="coll")){
pre <- x$str[which(x$class=="coll")-1]
coll <- x$str[which(x$class=="coll")]
post <- x$str[which(x$class=="coll")+1]
all <- ""
} else {
pre <- ""
coll <- ""
post <- ""
all <- paste0(x$str, collapse="")
}
z <- data.frame(pre, coll, post, all)
names(z) <-c(paste0(side, ".pre"), paste0(side, ".coll"), paste0(side, ".post"), paste0(side, ".all"))
z
}
Эта функция проверяет каждого члена documentdata$Left
или правильно, и если есть class=="coll"
мы разделяем, иначе мы вставляем все в all
колонка.
Чтобы запустить его один раз, попробуйте myparse(documentdata$Left[[1]], side="whatever")
Вы можете изменить 1 на других членов списка.
Тогда мы можем получить наши левый и правый, используя lapply
(который занимает myparse
и применяет его к каждому элементу списка), затем rbinding
список в data.frame. Середина легче:
library(dplyr)
left <- documentdata$Left %>% lapply(myparse, side="l") %>%
do.call(rbind, .)
mid <- do.call(rbind, documentdata$Kwic)$str
right <- documentdata$Right %>% lapply(myparse, side="r") %>%
do.call(rbind, .)
Тогда мы cbind
их вместе:
cbind(left, mid, right)