1

RでID番号シーケンスを持たないURLディレクトリのすべてのコンテンツをスクレイピングする最良の方法は何ですか? http://www.metalmusicarchives.com/album/ 内部のすべてのコンテンツを取得したいのですが、そのディレクトリ内のコンテンツの URL 形式は http://www.metalmusicarchives.com/album/[BAND NAME]/[ALBUM NAME] です。このアルバムディレクトリ内のすべての文字を列挙しようとして、次のコードを書きました:

# Collect every artist-page href by walking the alphabetical index pages A-Z,
# then derive each band's slug (last path segment), named by its full URL.
bandurls <- unlist(lapply(LETTERS, function(l) {
  index_page <- htmlParse(paste0("http://www.metalmusicarchives.com/ListArtistsAlpha.aspx?letter=", l))
  xpathSApply(index_page, '//div[@class="artistsListContainer"]/ul/li/a', xmlGetAttr, "href")
}))
bands <- setNames(sub(".*/(.*)", "\\1", bandurls), bandurls)

# For each band slug, parse its artist page and pull the href of every album
# link in the discography container whose href starts with this band's
# /album/<band> path.
albums <- sapply(bands, function(b) {
  artist_page <- htmlParse(paste0("http://www.metalmusicarchives.com/artist/", b))
  album_xpath <- paste0('//div[@class="discographyContainer"]/a[starts-with(@href,"/album/', b, '")]')
  sapply(artist_page[album_xpath], xmlGetAttr, "href")
})


URL <- sprintf("http://www.metalmusicarchives.com", albums)

# Scrape one album page into a one-row data.frame.
#
# ID: a complete album URL (the elements of URL above are already full
# addresses). The original body called sprintf(URL, ID), which passed the
# whole URL vector as the format string -- that both fetched the wrong
# address and produced the length > 1 warnings. It also used the
# deprecated rvest::html(); read_html() is the current name.
#
# Returns a 7-column data.frame; zero rows when the fetch/parse fails, so
# rbindlist() can still stack results from a mixed success/failure run.
METAL.SCRAPER <- function(ID) {
  PaGE <- try(read_html(ID), silent=TRUE)
  if (inherits(PaGE, "try-error")) {
    data.frame(Band=character(0), Year=character(0), Tracklist=character(0), Lineup=character(0),
           Release=character(0), Genre=character(0), Rating=character(0))
  } else {
    # Collapse each xpath match into one string: a selector that matches
    # zero or several nodes would otherwise give data.frame() columns of
    # unequal length and abort the whole scrape.
    grab <- function(xp) paste(PaGE %>% html_nodes(xpath=xp) %>% html_text(), collapse=" | ")
    # NOTE(review): Tracklist/Lineup/Release all read the same
    # albumInfosDetails node, so those three columns will be identical --
    # the xpaths likely need refining per field.
    data.frame(Band=grab('//head'),
           Year=grab('//h3[1]'),
           Tracklist=grab('//div[@id="albumInfosDetails"]'),
           Lineup=grab('//div[@id="albumInfosDetails"]'),
           Release=grab('//div[@id="albumInfosDetails"]'),
           Genre=grab('//span[@id="ctl00_MainContentPlaceHolder_AlbumInfosRepeater_ctl00_FiledUnderLabel"]'),
           Rating=grab('//span[@itemprop="average"]'),
           stringsAsFactors=FALSE)
  }
}

# Politeness delay: the original slept once *before* launching the whole
# scrape, which rate-limits nothing. Pause before each request instead so
# the server sees at most one fetch every 2 seconds.
DaTa <- rbindlist(pblapply(URL, function(u) {
  Sys.sleep(2)
  METAL.SCRAPER(u)
}))

Warning messages:
1: In if (grepl("^http", x)) { ... :
  the condition has length > 1 and only the first element will be used
2: In if (grepl("^http", x)) { ... :
  the condition has length > 1 and only the first element will be used
3: In if (grepl("^http", x)) { ... :
  the condition has length > 1 and only the first element will be used
4

1 に答える 1

1

理論的には、これをスクレイピングする 1 つの方法を次に示します。

library(XML)
# Walk the A-Z artist index pages and collect every artist link href.
bandurls <- unlist(lapply(LETTERS, function(letter)  
  xpathSApply(htmlParse(paste0("http://www.metalmusicarchives.com/ListArtistsAlpha.aspx?letter=", letter)), '//div[@class="artistsListContainer"]/ul/li/a', xmlGetAttr, "href") 
))
# Band slug = last path segment of each artist URL; keep the full URL as the name.
bands <- setNames(sub(".*/(.*)", "\\1", bandurls), bandurls)
# For each band, parse its artist page and harvest album link hrefs:
# anchors inside the discography container whose href starts with /album/<band>.
albums <- sapply(bands, function(band) {
  doc <- htmlParse(paste0("http://www.metalmusicarchives.com/artist/", band))
  sapply(doc[paste0('//div[@class="discographyContainer"]/a[starts-with(@href, "/album/', band, '")]')], xmlGetAttr, "href")
})
albums  # named list of relative album paths (sample output below shows duplicates per album)
# $`/artist/a-band-called-pain`
# [1] "/album/a-band-called-pain/broken-dreams" "/album/a-band-called-pain/broken-dreams"
# 
# $`/artist/a-band-of-orcs`
# [1] "/album/a-band-of-orcs/warchiefs-of-the-apocalypse(ep)"
# [2] "/album/a-band-of-orcs/warchiefs-of-the-apocalypse(ep)"
# [3] "/album/a-band-of-orcs/hall-of-the-frozen-dead(single)"
# [4] "/album/a-band-of-orcs/hall-of-the-frozen-dead(single)"
# ...

ただし、サイトをスクレイピングする前に、 許可されているかどうかをウェブマスターに確認してください。\m/

于 2015-03-19T19:33:20.127 に答える