What is the best way in R to scrape all of the content of a URL directory that has no ID number sequence? I want to pull everything under http://www.metalmusicarchives.com/album/, but the URL format for everything in that directory is http://www.metalmusicarchives.com/album/[BAND NAME]/[ALBUM NAME]. To cover every album, I tried working through the alphabetical artist index letter by letter:
library(XML)

# Artist page URLs from the alphabetical index, one request per letter
bandurls <- unlist(lapply(LETTERS, function(letter) {
  xpathSApply(htmlParse(paste0("http://www.metalmusicarchives.com/ListArtistsAlpha.aspx?letter=", letter)),
              '//div[@class="artistsListContainer"]/ul/li/a', xmlGetAttr, "href")
}))

# Band slug for each artist URL, then the album links on each artist page
bands  <- setNames(sub(".*/(.*)", "\\1", bandurls), bandurls)
albums <- sapply(bands, function(band) {
  doc <- htmlParse(paste0("http://www.metalmusicarchives.com/artist/", band))
  sapply(doc[paste0('//div[@class="discographyContainer"]/a[starts-with(@href,"/album/', band, '")]')],
         xmlGetAttr, "href")
})
From there I tried to build the full album URLs and scrape each page:

library(rvest)       # html(), html_nodes(), html_text()
library(pbapply)     # pblapply() with a progress bar
library(data.table)  # rbindlist()

URL <- sprintf("http://www.metalmusicarchives.com", albums)

METAL.SCRAPER <- function(ID) {
  PaGE <- try(html(sprintf(URL, ID)), silent = TRUE)
  if (inherits(PaGE, "try-error")) {
    # Return an empty frame so failed pages contribute nothing to rbindlist()
    data.frame(Band = character(0), Year = character(0), Tracklist = character(0),
               Lineup = character(0), Release = character(0), Genre = character(0),
               Rating = character(0))
  } else {
    data.frame(Band      = PaGE %>% html_nodes(xpath = '//head') %>% html_text(),
               Year      = PaGE %>% html_nodes(xpath = '//h3[1]') %>% html_text(),
               Tracklist = PaGE %>% html_nodes(xpath = '//div[@id="albumInfosDetails"]') %>% html_text(),
               Lineup    = PaGE %>% html_nodes(xpath = '//div[@id="albumInfosDetails"]') %>% html_text(),
               Release   = PaGE %>% html_nodes(xpath = '//div[@id="albumInfosDetails"]') %>% html_text(),
               Genre     = PaGE %>% html_nodes(xpath = '//span[@id="ctl00_MainContentPlaceHolder_AlbumInfosRepeater_ctl00_FiledUnderLabel"]') %>% html_text(),
               Rating    = PaGE %>% html_nodes(xpath = '//span[@itemprop="average"]') %>% html_text(),
               stringsAsFactors = FALSE)
  }
}
Sys.sleep(2)  # note: this pauses only once here, not between requests
DaTa <- rbindlist(pblapply(URL, METAL.SCRAPER))

Running that last step gives:
Warning messages:
1: In if (grepl("^http", x)) { ... :
the condition has length > 1 and only the first element will be used
2: In if (grepl("^http", x)) { ... :
the condition has length > 1 and only the first element will be used
3: In if (grepl("^http", x)) { ... :
the condition has length > 1 and only the first element will be used
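The warnings suggest that something expecting a single URL was handed a character vector of length greater than one: sprintf("http://www.metalmusicarchives.com", albums) has no %s placeholder, so the album paths are never appended to the domain, and sprintf(URL, ID) inside the scraper then passes the whole URL vector to html() instead of one address. A minimal sketch of a fix under that reading, with the same libraries loaded as above and the XPath extraction trimmed to two of the original fields for brevity (this assumes the hrefs in albums are site-relative paths like "/album/band-name/album-name"; in current rvest, html() would be read_html()):

# Build one absolute URL per album path up front
URL <- paste0("http://www.metalmusicarchives.com", unlist(albums, use.names = FALSE))

METAL.SCRAPER <- function(ID) {
  Sys.sleep(2)                           # pause between requests rather than once overall
  PaGE <- try(html(ID), silent = TRUE)   # ID is already one complete URL
  if (inherits(PaGE, "try-error")) {
    return(data.frame(Genre = character(0), Rating = character(0),
                      stringsAsFactors = FALSE))
  }
  data.frame(Genre  = PaGE %>% html_nodes(xpath = '//span[@id="ctl00_MainContentPlaceHolder_AlbumInfosRepeater_ctl00_FiledUnderLabel"]') %>% html_text(),
             Rating = PaGE %>% html_nodes(xpath = '//span[@itemprop="average"]') %>% html_text(),
             stringsAsFactors = FALSE)
}

DaTa <- rbindlist(pblapply(URL, METAL.SCRAPER))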