1

これはエクセルで簡単にできることです。しかし、私はRに当惑しています。

国名を文字列の長いリスト (「所属」) に割り当てたいと思います。

c("Department of Psychiatry and Behavioural Sciences, University College London Medical School, UK.", 
"", "Ty Dewi Sant School of Nursing, University Hospital of Wales, College of Medicine, Cardiff.", 
"University of Massachusetts Medical Center.", "Older Women's League.", 
"Kimberly Quality Care, Boston, MA.", "Michaux Manor Living Center, Fayetteville, PA.", 
"Florida Diagnostic and Learning Resources System, University of South Florida, Tampa 33613.", 
"", "Bigel Institute for Health Policy, Brandeis University, Waltham, MA.", 
"", "York Health Authority.", "Southern Illinois University, Edwardsville.", 
"St. Joseph's Hospital, Memphis, TN.", "Long Term Home Care of the Frail Elderly Foundation, New York City.", 
"Catholic University of America, Washington, DC.", "Mercy Health Center, Oklahoma City, OK.", 
"", "Visiting Nurse Service of New York.", "RespiteCare Center, Evanston, IL.", 
"Camden and Islington HA.", "National Advisory Council on Aging.", 
"Visiting Nurse Service of New York.", "American Health Care Association, Washington, DC.", 
"HealthCare Partners Medical Group, Los Angeles, CA 90015, USA.", 
"Tad Publishing Company, Peoria, IL, USA.", "Child Health Investment Partnership, Roanoke, VA, USA.", 
"School of Public Health, State University of New York, Albany 12237, USA.", 
"Bundoora Extended Care Centre.", "", "", "Family Respite Center, Falls Church, VA, USA.", 
"", "University of Victoria.", "", "Homemaker Health Aide Service of the National Capital Area.", 
"West Lambeth Health Authority, London SE1 7EH.", "Bon Secours Hospital/Villa Maria Nursing Center, North Miami, FL 33161.", 
"Alzheimer's Disease and Related Disorders Association, Syracuse, NY.", 
"Alzheimer's Association, Washington DC.", "South Carolina Commission on Aging, Columbia.", 
"University of New Mexico College of Nursing.", "Department of Human Development and Family Studies, University of Alabama, Tuscaloosa.", 
"Ballard Health Care Residence, Des Plaines, IL.", "Bowman Gray School of Medicine of Wake Forest University, Winston-Salem, NC.", 
"Case Western Reserve University.", "School of Public and Environmental Administration, Indiana University, Indianapolis 46202.", 
"Manor HealthCare Corp, Silver Spring, MD.", "Relationship Builders, Napa, CA.", 
"", "", "Medical University of South Carolina, USA.", "Tokyo Metropolitan Institute of Gerontology, Itabashi, Japan. tatsuro@tmig.or.jp", 
"Medical University of South Carolina, USA.", "Royal Hospital for Sick Children, Bristol.", 
"Barefield, Ennis, Co. Clare., Ireland.", "North Georgia College, Dahlonega 30597, USA.", 
"Institute for Psychology (I), University of Wurzburg, Germany.", 
"Camborne Redruth Community Hospital, Cornwall, United Kingdom.", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "Institute of Child Health and Great Ormond Street Hospital for Children NHS Trust, London, UK.", 
"Department of Psychiatry, University of Toronto, Toronto, Ontario, Canada. carol.cohen@sunnybrook.on.ca", 
"Boston University School of Social Work, MA 02215, USA.", "", 
"Neurosciences Unit, General Infirmary at Leeds.", "", "", "School of Kang-Ning Junior College of Nursing, Nei-Hu, Taiwan, ROC.", 
"College of Nursing, South Dakota State University, USA.", "Department of Geriatric Medicine, University of Manchester, UK.", 
"Southern Illinois University, Department of Social Work, Edwardsville 62026-1450, USA.", 
"Redlands Community College, El Reno, Oklahoma, USA.", "", "", 
"Department of Geriatric Medicine, Alexandra Hospital, Singapore.", 
"School of Nursing and Midwifery, Department of Gerontological and Continuing Care Nursing, University of Sheffield, Sheffield, England. Liz.hanson@act.shef.ac.uk", 
"", "State University of New York, Health Science Center at Syracuse, 13210, USA. HAMR@mailbox.hscsyr.edu", 
"Div. of Active Palliative Care, Todachuo General Hospital.", 
"Children and Young People's Kidney Unit, Nottingham City Hospital, NHS Trust, UK.", 
"School of Nursing & Midwifery, Department of Gerontological & Continuing Care Nursing, University of Sheffield. liz.hanson@act.shef.ac.uk", 
"Harrington Memorial Hospital, Southbridge, MA, USA.", "", "Department of Curriculum and Instruction, Iowa State University, Ames, 50011. USA.", 
"Children & Young People's Kidney Unit, Nottingham City Hospital, U.K.", 
"School of Social Work, Boston University, MA 02215, USA. freedman@bu.edu", 
"Royal Free Hospital, London, UK.", "Humboldt State University, Department of Nursing, Arcata, CA, USA.", 
"Department of Psychiatry, The University of Queensland, Mental Health Centre, Royal Brisbane Hospital, Herston, Australia. davidk@psychiatry.uq.edu.au", 
"Centre for Evidence Based Nursing, University of York, Heslington, York, Nth Yorkshire, UK, YO1 5DG. cat4@york.ac.uk", 
"School of Nursing, University of British Columbia, Vancouver. magenta@bc.sympatico.ca", 
"Medisinsk avdeling, Lovisenberg Diakonale Sykehus, Oslo.", "School of Nursing, Yale University, USA.", 
"Centre de la Mémoire, Hôpital Roger Salengro, Centre Hospitalier Universitaire, Lille.", 
"University of Ulster and Eastern Health and Social Services Board, Ulster, Northern Ireland. r.mcconkey@ulst.ac.uk", 
"Thames Valley Family Practice Research Unit, Department of Family Medicine's Centre for Studies in Family Medicine, University of Western Ontario (UWO), London. jbbrown@julian.uwo.ca", 
"", "", "Department of Special Education, University of Nijmegen, The Netherlands. A.Hendriks@ped.kun.nl", 
"European Institute of Health and Medical Sciences, University of Surrey, Guildford, England.", 
"California State University School of Nursing, Chico, USA.")

各文字列内には、場所を参照する部分文字列がある場合とない場合があり、それ自体が国を参照する場合があります。意図した出力は、次のようなデータフレームです。

Affiliation[1], matchedCountry
Affiliation[2], matchedCountry
...
Affiliation[n], matchedCountry

「matchedCountry」は、複数のリスト (大学、英国の都市、米国の州など) に基づいて評価されることを意図しており、NA が許可されています。また、一部のリストは ISO コードのみを返します。

これまでのフィードバック (@rbm に感謝します) をもとに、かなりうまく動作するソリューションをなんとか作成できました (回答セクションを参照)。とはいえ、パフォーマンスにはまだ改善の余地があると確信しています。よろしくお願いします。

参考文献:

  1. リスト内の複数の data.frame を同時にマージする
  2. R grepl: 複数の部分文字列に対して複数の文字列をすばやく照合し、すべての一致を返す
  3. R grep: 1 つの文字列を複数のパターンと照合する
  4. ある列の行の値がデータ フレームの別の列内にあるかどうかを確認する R データ フレームの高速テスト
  5. リストに含まれるすべての文字列ではなく一部の文字列から複数のパターンを使用して複数の部分文字列を抽出して結合し、R のリストに戻す
  6. Rの文字列内の複数のリストから部分文字列を検出する方法
4

1 に答える 1

0

これは、マスターリストの各項目を複数の部分文字列リストと照合し、リストに応じて a) 一致した部分文字列そのもの、b) 同じ行にある関連する値、または c) 固定 (事前定義) の値のいずれかを返すソリューションです。結果は、元のテーブルに「国」列を追加したものになります。

これらの条件は、提供されているサンプル コードに示されています。

編集:「ドメイン」ルックアップが意図したとおりに機能していないようです。トラブルシューティング/修正方法はよくわかりませんが、この質問の範囲を超えていると思います...

######### GENERATE COUNTRY ID  #############

  library("stringr")
  library(RCurl)

  ## Download the lookup tables (as raw CSV text) and prepare them as data frames.
  ##
  ## Each getURL() call returns the file contents as one string; read.csv(text = ...)
  ## then parses that string, replacing the raw text held in the same variable.

  countryList <- getURL("https://raw.githubusercontent.com/umpirsky/country-list/master/country/icu/en_US/country.csv")
  usstatesList <- getURL("https://raw.githubusercontent.com/jasonong/List-of-US-States/master/states.csv")
  ukcitiesList <- getURL("https://raw.githubusercontent.com/encyclopediaio/list-of-cities-in-the-uk/master/src/uk_cities.csv")
  ukcountryList <- getURL("https://raw.githubusercontent.com/Gibbs/UK-Postcodes/master/postcodes.csv")
  universitiesList <- getURL("https://raw.githubusercontent.com/endSly/world-universities-csv/master/world-universities.csv")

  ## na.strings = "" on the two tables that carry ISO country codes: with
  ## read.csv()'s default (na.strings = "NA") Namibia's code "NA" is parsed as a
  ## missing value, and a single NA among the lookup codes turns every
  ## sum(str_detect(...)) > 0 test that uses them into NA.
  countryList <- read.csv(text = countryList, stringsAsFactors = FALSE, na.strings = "")
  usstatesList <- read.csv(text = usstatesList, stringsAsFactors = FALSE)
  ukcitiesList <- read.csv(text = ukcitiesList, stringsAsFactors = FALSE)
  ukcountryList <- read.csv(text = ukcountryList, stringsAsFactors = FALSE)
  ## This file is read without a header row, so its columns are named V1, V2, ...
  universitiesList <- read.csv(text = universitiesList, header = FALSE, stringsAsFactors = FALSE, na.strings = "")

  ## Generate affiliation list from the publication data
  ## NOTE(review): pub.data is not defined in this snippet; presumably a data
  ## frame of publication records whose Affiliation1 column holds the free-text
  ## affiliation strings -- confirm against the code that builds it.
  affiliationList <- pub.data$Affiliation1

  ## Turn country codes into e-mail domain suffixes:
  ## each element of x is lower-cased and prefixed with a dot ("US" -> ".us").
  domains <- function(x) {
    paste0(".", tolower(x))
  }

  ## Keep only the name and iso columns, then append the matching domain suffix.
  countryList <- countryList[c("name", "iso")]
  countryList$domain <- domains(countryList$iso)


  ## Add country names to universitiesList as V4
  ##
  ## V1 of each row is looked up in countryList$iso and the corresponding
  ## countryList$name is stored in V4. Rows whose code is missing or unknown
  ## keep V4 = "".
  ##
  ## This is a single vectorised match() instead of a row-by-row loop: it takes
  ## the first matching code, needs no manual counter, and has no error handler
  ## that could silently discard every row when a lookup code is NA.

  universitiesList <- data.frame(universitiesList, V4 = "", stringsAsFactors = FALSE)

  uniCode <- trimws(as.character(universitiesList$V1))
  isoRow <- match(uniCode, countryList$iso)
  ## An NA code must not pair up with an NA entry in countryList$iso.
  resolved <- !is.na(uniCode) & !is.na(isoRow)
  universitiesList$V4[resolved] <- countryList$name[isoRow[resolved]]

  ### on to the main show
  ##
  ## Build `df`: one row per affiliation plus the ISO code and the name of the
  ## country detected in it ("" when nothing matched). The lookups run in a
  ## fixed order and a later hit overwrites an earlier one:
  ##   country name < UK city < UK country < US state < e-mail domain < university

  df <- data.frame(affiliationList, CountryISO = "", CountryNAME = "", stringsAsFactors = FALSE)

  ## Index (into `patterns`) of the longest pattern that occurs literally in
  ## `text`, or NA_integer_ when none does. NA and empty patterns are ignored.
  ## Matching is literal (fixed()), so ".", "(" etc. in names are not treated as
  ## regex syntax; preferring the longest hit picks "Nigeria" over "Niger".
  bestHit <- function(text, patterns) {
    usable <- which(!is.na(patterns) & nzchar(patterns))
    if (length(usable) == 0) {
      return(NA_integer_)
    }
    found <- usable[which(str_detect(text, fixed(patterns[usable])))]
    if (length(found) == 0) {
      return(NA_integer_)
    }
    found[which.max(nchar(patterns[found]))]
  }

  ## Last label of the host part of the first e-mail address in `text`, lower
  ## case and with a leading dot (".jp"); NA_character_ when there is no address.
  emailTld <- function(text) {
    host <- str_extract(text, "(?<=@)[A-Za-z0-9.-]+")
    if (is.na(host)) {
      return(NA_character_)
    }
    ## Drop sentence punctuation captured after the address ("x@y.ca.").
    host <- tolower(sub("[.-]+$", "", host))
    if (!grepl(".", host, fixed = TRUE)) {
      return(NA_character_)
    }
    sub("^.*(\\.[^.]+)$", "\\1", host)
  }

  ## State abbreviations must match as whole words, otherwise "NHS" counts as
  ## "NH". Built once, outside the loop.
  ## Assumes the abbreviations are plain letters (no regex metacharacters).
  stateAbbr <- usstatesList$Abbreviation
  stateAbbr <- stateAbbr[!is.na(stateAbbr) & nzchar(stateAbbr)]
  stateAbbrRegex <- if (length(stateAbbr) > 0) {
    paste0("\\b(?:", paste(stateAbbr, collapse = "|"), ")\\b")
  } else {
    NA_character_
  }

  for (i in seq_along(affiliationList))
  {
    v <- as.character(affiliationList[i])

    ## Nothing to look up in a missing or empty affiliation.
    if (is.na(v) || !nzchar(v)) {
      next
    }

    tryCatch({
      hit <- bestHit(v, countryList$name)
      if (!is.na(hit)) {
        df$CountryISO[i] <- countryList$iso[hit]
        df$CountryNAME[i] <- countryList$name[hit]
      }

      if (!is.na(bestHit(v, ukcitiesList$name))) {
        df$CountryISO[i] <- "GB"
        df$CountryNAME[i] <- "United Kingdom"
      }

      if (!is.na(bestHit(v, ukcountryList$country_string))) {
        df$CountryISO[i] <- "GB"
        df$CountryNAME[i] <- "United Kingdom"
      }

      if (!is.na(bestHit(v, usstatesList$State)) ||
          (!is.na(stateAbbrRegex) && str_detect(v, stateAbbrRegex))) {
        df$CountryISO[i] <- "US"
        df$CountryNAME[i] <- "United States"
      }

      ## Compare the address's top-level domain for equality instead of
      ## searching for ".xx" as a regex, where "." matches any character.
      tld <- emailTld(v)
      if (!is.na(tld)) {
        ## countryList$domain is derived from ISO codes, so the United Kingdom
        ## is listed as ".gb" while real addresses end in ".uk".
        if (tld == ".uk") {
          tld <- ".gb"
        }
        hit <- match(tld, countryList$domain)
        if (!is.na(hit)) {
          df$CountryISO[i] <- countryList$iso[hit]
          df$CountryNAME[i] <- countryList$name[hit]
        }
      }

      ## Both values come from the row whose university name (V2) matched:
      ## V1 supplies the code and V4 the country name.
      hit <- bestHit(v, universitiesList$V2)
      if (!is.na(hit)) {
        df$CountryISO[i] <- universitiesList$V1[hit]
        df$CountryNAME[i] <- universitiesList$V4[hit]
      }
    }, error = function(e) {
      ## Stay best-effort per row, but report the failure instead of hiding it.
      warning("Country lookup failed for affiliation ", i, ": ",
              conditionMessage(e), call. = FALSE)
    })
  }

## NOTE(review): return() is only valid inside a function and no enclosing
## function header is visible in this snippet; if this runs as a top-level
## script, replace the call with a bare `df`.
return(df)

提供されたすべてのヘルプに感謝します!

2015-09-24T08:14:50.717 に回答