Ускорьте очистку веб-страниц с помощью нескольких браузеров Rselenium
Я использую Rselenium, чтобы удалить следующий веб-сайт: http://plovila.pomorstvo.hr/
Каждый раз, когда мне нужно ввести поле 'NIB', выполнить и удалить все данные. Я использую функцию Sys.time() несколько раз, поэтому мой код работает медленно (около 12 секунд для одного NIB). Мне нужно собрать около 200000 номеров NIB, что дает 30 дней на очистку.
Мне интересно, могу ли я открыть несколько браузеров локально или как-то в облаке и ускорить мой сценарий очистки.
Можно ли использовать параллельные вычисления, чтобы преодолеть эту проблему? У вас есть какие-нибудь предложения?
РЕДАКТИРОВАТЬ: я добавляю код:
library(XML)
library(RCurl)
library(RSelenium)
library(png)
library(imager)
library(RMySQL)
library(htmltab)
library(jsonlite)
library(rvest)
# function for waiting instead Sys.sleep()
waitLoad <- function (xpath_check = "//input[@id = 'ctl00_Content_FormContent_uiIspisGrid_ctl00__0']/td[2]",
iterations = 5){
counter <- 0
chk <- FALSE
while(!chk & counter <= iterations){
wait <- tryCatch(
remDr$findElement(using = "xpath",
xpath_check)$getElementText(),
# remDr$findElement(using = "xpath", "//input[@id = 'ctl00_Content_FormContent_Img1']")$clearElement(),
error = function(e) print(paste0("Trazi dalje"))
)
if(wait == "Trazi dalje" ){
Sys.sleep(1L)
counter <- sum(counter, 1)
}else{
chk <- TRUE
}
}
}
# Start Selenium Server
# docker run -d -p 4445:4444 selenium/standalone-chrome:3.5.0
remDr <- remoteDriver(remoteServerAddr = "192.168.99.100", port = 4445L, browserName = "chrome")
remDr$open()
# Simulate browser session and fill out form
remDr$navigate("http://plovila.pomorstvo.hr/")
remDr$findElement(using = "xpath", "//select[@id = 'ctl00_Content_FormContent_uiTipObjektaDropDown']/option[@value = '1']")$clickElement()
remDr$screenshot(display = TRUE)
# Scrap !
df <- list()
Porivni_uredjaji <- list()
Clanovi_posade <- list()
Vlasnici <- list()
Korisnici <- list()
df_2 <- list()
Tereti <- list()
pocetak <- 100000
kraj <- 100003
system.time(
for (i in pocetak:kraj){
remDr$findElement(using = "xpath", "//input[@id = 'ctl00_Content_FormContent_uiNibTextBox']")$clearElement()
Sys.sleep(1L)
remDr$findElement(using = "xpath",
"//input[@id = 'ctl00_Content_FormContent_uiNibTextBox']")$sendKeysToElement(list(as.character(i),
key = "enter"))
waitLoad()
remDr$screenshot(display = TRUE)
doc <- htmlParse(remDr$getPageSource()[[1]])
Sys.sleep(1L)
Ime <- xpathSApply(doc = doc, path = "//*[@id='ctl00_Content_FormContent_uiIspisGrid_ctl00__0']/td[1]", fun = xmlValue)
Oznaka <- xpathSApply(doc = doc, path = "//*[@id='ctl00_Content_FormContent_uiIspisGrid_ctl00__0']/td[2]", fun = xmlValue)
NIB <- xpathSApply(doc = doc, path = "//*[@id='ctl00_Content_FormContent_uiIspisGrid_ctl00__0']/td[3]", fun = xmlValue)
Vlasnik <- xpathSApply(doc = doc, path = "//*[@id='ctl00_Content_FormContent_uiIspisGrid_ctl00__0']/td[4]", fun = xmlValue)
LK_LI <- xpathSApply(doc = doc, path = "/html/body/form/div[4]/div[1]/div[3]/table/tbody/tr/td[5]", fun = xmlValue)
br1 <- xpathSApply(doc = doc, path = "/html/body/form/div[4]/div[1]/div[3]/table/tbody/tr/td[6]", fun = xmlValue)
br2 <- xpathSApply(doc = doc, path = "/html/body/form/div[4]/div[1]/div[3]/table/tbody/tr/td[7]", fun = xmlValue)
x <- i-pocetak + 1
if (length(NIB)==0){
Pozivni_znak <- NA
df[[x]] <- cbind(Ime, Oznaka, NIB, Vlasnik, LK_LI, br1, br2, Pozivni_znak)
df[[x]] <- as.data.frame(df[[x]], stringsAsFactors = FALSE)
}else{
remDr$findElement(using = "xpath", "//input[@title = 'Detalji']")$clickElement()
waitLoad("//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiNamjenaText']", 5)
doc <- htmlParse(remDr$getPageSource()[[1]], encoding = "UTF-8")
Sys.sleep(1L)
list_a <- xpathSApply(doc = doc, path = "/html/body/form/div[4]/div[1]/div[3]/fieldset/h3[1]", fun = xmlValue)
if (length(list_a) >= 1){
Namjena <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiNamjenaText']/@value")
json <- paste0("[", '"', Namjena, '"', "]")
Namjena <- fromJSON(json)
Namjena <- as.data.frame(Namjena, stringsAsFactors = FALSE)
colnames(Namjena) <- "Namjena"
Vrsta_plovila <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiVrstaPlovilaText']/@value")
json <- paste0("[", '"', Vrsta_plovila, '"', "]")
Vrsta_plovila <- fromJSON(json)
Vrsta_plovila <- as.data.frame(Vrsta_plovila, stringsAsFactors = FALSE)
colnames(Vrsta_plovila) <- "Vrsta_plovila"
Model_plovila <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiModelPlovilaText']/@value")
json <- paste0("[", '"', Model_plovila, '"', "]")
Model_plovila <- fromJSON(json)
Model_plovila <- as.data.frame(Model_plovila, stringsAsFactors = FALSE)
colnames(Model_plovila) <- "Model_plovila"
Duljina_trupa <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiDuljinaTrupaText']/@value")
json <- paste0("[", '"', Duljina_trupa, '"', "]")
Duljina_trupa <- fromJSON(json)
Duljina_trupa <- as.data.frame(Duljina_trupa, stringsAsFactors = FALSE)
colnames(Duljina_trupa) <- "Duljina_trupa"
Sirina_trupa <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiSirinaText']/@value")
json <- paste0("[", '"', Sirina_trupa, '"', "]")
Sirina_trupa <- fromJSON(json)
Sirina_trupa <- as.data.frame(Sirina_trupa, stringsAsFactors = FALSE)
colnames(Sirina_trupa) <- "Sirina_trupa"
Visina_trupa <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiVisinaText']/@value")
json <- paste0("[", '"', Visina_trupa, '"', "]")
Visina_trupa <- fromJSON(json)
Visina_trupa <- as.data.frame(Visina_trupa, stringsAsFactors = FALSE)
colnames(Visina_trupa) <- "Visina_trupa"
Gaz <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiGazText']/@value")
json <- paste0("[", '"', Gaz, '"', "]")
Gaz <- fromJSON(json)
Gaz <- as.data.frame(Gaz, stringsAsFactors = FALSE)
colnames(Gaz) <- "Gaz"
Nosivost <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiNosivostText']/@value")
json <- paste0("[", '"', Nosivost, '"', "]")
Nosivost <- fromJSON(json)
Nosivost <- as.data.frame(Nosivost, stringsAsFactors = FALSE)
colnames(Nosivost) <- "Nosivost"
GT <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiGtText']/@value")
json <- paste0("[", '"', GT, '"', "]")
GT <- fromJSON(json)
GT <- as.data.frame(GT, stringsAsFactors = FALSE)
colnames(GT) <- "GT"
Snaga_motora <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiUkupnaSnagaText']/@value")
json <- paste0("[", '"', Snaga_motora, '"', "]")
Snaga_motora <- fromJSON(json)
Snaga_motora <- as.data.frame(Snaga_motora, stringsAsFactors = FALSE)
colnames(Snaga_motora) <- "Snaga_motora"
Brodogradiliste <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiBrodogradilisteText']/@value")
Brodogradiliste <- gsub("\"", "'", Brodogradiliste)
json <- paste0("[", '"', Brodogradiliste, '"', "]")
Brodogradiliste <- fromJSON(json)
Brodogradiliste <- as.data.frame(Brodogradiliste, stringsAsFactors = FALSE)
Encoding(Brodogradiliste[,c(1)]) <- "UTF-8"
colnames(Brodogradiliste) <- "Brodogradiliste"
Godina_gradnje <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiGodGradnjeText']/@value")
json <- paste0("[", '"', Godina_gradnje, '"', "]")
Godina_gradnje <- fromJSON(json)
Godina_gradnje <- as.data.frame(Godina_gradnje, stringsAsFactors = FALSE)
colnames(Godina_gradnje) <- "Godina_gradnje"
Materijal <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiMaterijalGradnjeText']/@value")
json <- paste0("[", '"', Materijal, '"', "]")
Materijal <- fromJSON(json)
Materijal <- as.data.frame(Materijal, stringsAsFactors = FALSE)
colnames(Materijal) <- "Materijal"
Najveci_broj_osoba <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiMaxBrojOsobaText']/@value")
json <- paste0("[", '"', Najveci_broj_osoba, '"', "]")
Najveci_broj_osoba <- fromJSON(json)
Najveci_broj_osoba <- as.data.frame(Najveci_broj_osoba, stringsAsFactors = FALSE)
colnames(Najveci_broj_osoba) <- "Najveci_broj_osoba"
Najveci_broj_putnika <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiMaxBrojPutnikaText']/@value")
json <- paste0("[", '"', Najveci_broj_putnika, '"', "]")
Najveci_broj_putnika <- fromJSON(json)
Najveci_broj_putnika <- as.data.frame(Najveci_broj_putnika, stringsAsFactors = FALSE)
colnames(Najveci_broj_putnika) <- "Najveci_broj_putnika"
Najmanji_broj_posade <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiMinBrojPosade']/@value")
json <- paste0("[", '"', Najmanji_broj_posade, '"', "]")
Najmanji_broj_posade <- fromJSON(json)
Najmanji_broj_posade <- as.data.frame(Najmanji_broj_posade, stringsAsFactors = FALSE)
colnames(Najmanji_broj_posade) <- "Najmanji_broj_posade"
Prethodna_oznaka <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiPrethodnaOznakaText']/@value")
json <- paste0("[", '"', Prethodna_oznaka, '"', "]")
Prethodna_oznaka <- fromJSON(json)
Prethodna_oznaka <- as.data.frame(Prethodna_oznaka, stringsAsFactors = FALSE)
colnames(Prethodna_oznaka) <- "Prethodna_oznaka"
Prethodna_luka <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiPrethodnaLukaUpisaText']/@value")
Prethodna_luka <- gsub("\"", "'", Prethodna_luka)
json <- paste0("[", '"', Prethodna_luka, '"', "]")
Prethodna_luka <- fromJSON(json)
Prethodna_luka <- as.data.frame(Prethodna_luka, stringsAsFactors = FALSE)
colnames(Prethodna_luka) <- "Prethodna_luka"
Prethodna_drĹľava <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiPrethodnaDrzavaUpisaText']/@value")
json <- paste0("[", '"', Prethodna_drĹľava, '"', "]")
Prethodna_drĹľava <- fromJSON(json)
Prethodna_drĹľava <- as.data.frame(Prethodna_drĹľava, stringsAsFactors = FALSE)
colnames(Prethodna_drĹľava) <- "Prethodna_drĹľava"
df[[x]] <- cbind(Ime, Oznaka, NIB, Vlasnik, LK_LI, br1, br2, Namjena, Vrsta_plovila,
Model_plovila, Duljina_trupa, Sirina_trupa, Visina_trupa, Gaz, Nosivost, GT,
Snaga_motora, Brodogradiliste, Godina_gradnje, Materijal, Najveci_broj_osoba,
Najveci_broj_putnika, Najmanji_broj_posade, Prethodna_oznaka,
Prethodna_luka, Prethodna_drĹľava)
df[[x]] <- as.data.frame(df[[x]], stringsAsFactors = FALSE)
df_2 <- readHTMLTable(doc)
Sys.sleep(2L)
Porivni_uredjaji[[x]] <- tryCatch(as.data.frame(cbind(df_2[[2]], NIB), stringsAsFactors = FALSE), error=function(e) print(paste0("Error ", NIB)))
Clanovi_posade[[x]] <- tryCatch(as.data.frame(cbind(df_2[[3]], NIB), stringsAsFactors = FALSE), error=function(e) print(paste0("Error ", NIB)))
Vlasnici[[x]] <- tryCatch(as.data.frame(cbind(df_2[[4]], NIB), stringsAsFactors = FALSE), error=function(e) print(paste0("Error ", NIB)))
Korisnici[[x]] <- tryCatch(as.data.frame(cbind(df_2[[5]], NIB), stringsAsFactors = FALSE), error=function(e) print(paste0("Error ", NIB)))
Tereti[[x]] <- cbind(remDr$findElement(using = "xpath", "//*/span[@id='ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiTeretiLabel']")$getElementText(), NIB)
}}
}
)
# manipulate data after scraping
for (i in 1:length(df)){
if (length(df[[i]]) < 13){
df[[i]] <- matrix(data = rep(NA, 26), nrow = 1, ncol = 26)
df[[i]] <- as.data.frame(df[[i]])
colnames(df[[i]]) <- c("Ime", "Oznaka", "NIB", "Vlasnik", "LK_LI", "br1", "br2","Namjena",
"Vrsta_plovila", "Model_plovila", "Duljina_trupa", "Sirina_trupa", "Visina_trupa",
"Gaz", "Nosivost", "GT", "Snaga_motora", "Brodogradiliste", "Godina_gradnje",
"Materijal", "Najveci_broj_osoba", "Najveci_broj_putnika", "Najmanji_broj_posade",
"Prethodna_oznaka", "Prethodna_luka", "Prethodna_drĹľava")
}
}
df_final <- do.call(rbind, df)
df_final_1 <- df_final[!is.na(df_final$NIB), ]
РЕДАКТИРОВАТЬ 2: у меня есть проблема с кодом выше вы разместили. Если я бегу:
(cl <- (detectCores() - 1) %>% makeCluster) %>% registerDoParallel
# open a remoteDriver for each node on the cluster
# docker run -d -p 4445:4444 selenium/standalone-chrome:3.5.3
clusterEvalQ(cl, {
library(RSelenium)
remDr <- remoteDriver(remoteServerAddr = "192.168.99.100", port = 4445L, browserName = "chrome")
remDr$open()
})
myTitles <- c()
ws <- foreach(x = 1:length(urls),
.packages = c("rvest", "magrittr", "RSelenium", "jsonlite", "htmltab", "XML", "RCurl")) %dopar% {
remDr$navigate(urls[x])
Sys.sleep(3L)
remDr$getTitle()[[1]]
}
возвращает ошибку
Error in { : task 1 failed - " Summary: UnknownError
Detail: An unknown server-side error occurred while processing the command.
Further Details: run errorDetails method"
1 ответ
Возможно проблема с chrome:3.5.0 образ докера. Следующее работает для меня на win 10 с докером инструментов:
library(RSelenium)
library(rvest)
library(magrittr)
library(foreach)
library(doParallel)
# using docker run -d -p 4445:4444 selenium/standalone-chrome:3.5.3
# in windows
URLsPar <- c("https://stackru.com/", "https://github.com/",
"http://www.bbc.com/", "http://www.google.com",
"https://www.r-project.org/", "https://cran.r-project.org",
"https://twitter.com/", "https://www.facebook.com/")
appHTML <- c()
(cl <- (detectCores() - 1) %>% makeCluster) %>% registerDoParallel
# open a remoteDriver for each node on the cluster
clusterEvalQ(cl, {
library(RSelenium)
remDr <- remoteDriver(remoteServerAddr = "192.168.99.100", port = 4445L,
browserName = "chrome")
remDr$open()
})
ws <- foreach(x = 1:length(URLsPar),
.packages = c("rvest", "magrittr", "RSelenium")) %dopar% {
print(URLsPar[x])
remDr$navigate(URLsPar[x])
remDr$getTitle()[[1]]
}
> ws
[[1]]
[1] "Stack Overflow - Where Developers Learn, Share, & Build Careers"
[[2]]
[1] "The world's leading software development platform · GitHub"
[[3]]
[1] "BBC - Homepage"
[[4]]
[1] "Google"
[[5]]
[1] "R: The R Project for Statistical Computing"
[[6]]
[1] "The Comprehensive R Archive Network"
[[7]]
[1] "Twitter. It's what's happening."
[[8]]
[1] "Facebook - Log In or Sign Up"
# close browser on each node
clusterEvalQ(cl, {
remDr$close()
})
stopImplicitCluster()