1

I have two data frames with country-level data from two different sources. The data frames contain the country name and an amount of a certain variable (I just made up the amounts to avoid too long numbers). See an example of the data frames below. The problem is the country naming differs between the two sources and in some cases similar county names may differ by the spelling in the two sources (e.g. CentralAfricanRepublic in df1 is Central African Republic in df2). I would like to use df1 as a reference to identify the countries that are the same in both data frames using some kind of fuzzy matching then select only the data for the countries that appear in df1 from df2. Please note that df2 has more countries.

df1 <- data.frame(country=c("Argentina", "Bahamas", "Barbados", "Belize", "Bolivia", "Brazil", "BritishVirginIslands", "Chile", "Colombia", "CostaRica", "Cuba", 
                        "Dominica", "DominicanRepublic", "Ecuador", "ElSalvador", "FrenchGuiana", "Guadeloupe", "Guatemala", "Guyana", "Haiti", "Honduras", "Jamaica", 
                        "Martinique", "Mexico", "Nicaragua", "Panama", "Paraguay", "Peru","StLucia", "StVincent", "Suriname", "TrinidadandTobago", "Uruguay", 
                        "Venezuela", "Bangladesh", "Bhutan", "Brunei", "Cambodia", "India", "Indonesia", "Laos", "Malaysia", "Myanmar", "Nepal", "Pakistan", 
                        "Philippines", "PapuaNewGuinea", "Singapore", "SriLanka", "Thailand", "TimorLeste", "Vietnam", "Angola", "Benin", "Botswana", "BurkinaFaso", 
                        "Burundi", "Cameroon", "CentralAfricanRepublic", "Chad", "Congo","Djibouti", "DemocraticRepublicoftheCongo", "EquatorialGuinea", 
                        "Eritrea", "Ethiopia", "Gabon", "Gambia", "Ghana", "Guinea","GuineaBissau", "IvoryCoast", "Kenya", "Lesotho", "Liberia", 
                        "Madagascar", "Malawi", "Mauritania", "Mali", "Mozambique", "Namibia","Niger", "Nigeria", "Rwanda", "Sudan", "Senegal", "SierraLeone", 
                        "Somalia", "SouthAfrica", "SouthSudan", "Swaziland", "Tanzania", "Togo", "Uganda", "Zambia", "Zimbabwe", "Canada", "UnitedStatesofAmerica", 
                        "Albania", "Andorra", "Austria", "Belgium", "Bosnia", "Bulgaria", "Croatia", "CzechRepublic", "Denmark", "Finland", "France", "Germany", 
                        "Greece", "Hungary", "Iceland", "Ireland", "Italy", "Liechtenstein","Luxembourg", "Macedonia", "Malta", "Montenegro", "Netherlands", 
                        "Norway", "Poland", "Portugal", "Romania", "Serbia", "Slovakia","Slovenia", "Spain", "Sweden", "Switzerland", "UnitedKingdom", 
                        "Afghanistan", "Algeria", "Bahrain", "Cyprus", "Egypt", "Iran","Iraq", "Israel", "Jordan", "Kuwait", "Lebanon", "Libya", "Morocco", 
                        "Oman", "Qatar", "SaudiArabia", "Syria", "Tunisia", "Turkey","UnitedArabEmirates", "WesternSahara", "Yemen", "Armenia", "Azerbaijan", 
                        "Belarus", "Estonia", "Georgia", "Kazakhstan", "Kyrgyzstan", "Latvia", "Lithuania", "Moldova", "Russia", "Tajikistan", "Turkmenistan", 
                        "Ukraine", "Uzbekistan", "Japan", "Mongolia", "NorthKorea", "SouthKorea","China", "Australia", "CookIslands", "Fiji", "FrenchPolynesia", 
                        "Micronesia", "NewCaledonia", "NewZealand", "Niue", "Samoa", "SolomonIslands", "Tonga", "Vanuatu")
              Amounts=c(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 
                        32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 
                        64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94, 
                        96, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 
                        122, 124, 126, 128, 130, 132, 134, 136, 138, 140, 142, 144, 146, 
                        148, 150, 152, 154, 156, 158, 160, 162, 164, 166, 168, 170, 172, 
                        174, 176, 178, 180, 182, 184, 186, 188, 190, 192, 194, 196, 198, 
                        200, 202, 204, 206, 208, 210, 212, 214, 216, 218, 220, 222, 224, 
                        226, 228, 230, 232, 234, 236, 238, 240, 242, 244, 246, 248, 250, 
                        252, 254, 256, 258, 260, 262, 264, 266, 268, 270, 272, 274, 276, 
                        278, 280, 282, 284, 286, 288, 290, 292, 294, 296, 298, 300, 302, 
                        304, 306, 308, 310, 312, 314, 316, 318, 320, 322, 324, 326, 328, 
                        330, 332, 334, 336, 338, 340, 342, 344, 346, 348, 350, 352, 354, 
                        356, 358, 360, 362, 364, 366, 368, 370))

df2 <- data.frame(country=c("Afghanistan", "Albania", "Algeria", "American Samoa", "Andorra", 
                        "Angola", "Anguilla", "Antigua and Barbuda", "Argentina", "Armenia", 
                        "Aruba", "Australia", "Austria", "Azerbaijan", "Bahamas", "Bahrain", 
                        "Bangladesh", "Barbados", "Belarus", "Belgium", "Belgium-Luxembourg", 
                        "Belize", "Benin", "Bermuda", "Bhutan", "Bolivia (Plurinational State of)", 
                        "Bosnia and Herzegovina", "Botswana", "Brazil", "British Virgin Islands", 
                        "Brunei Darussalam", "Bulgaria", "Burkina Faso", "Burundi", "Cabo Verde", 
                        "Cambodia", "Cameroon", "Canada", "Cayman Islands", "Central African Republic", 
                        "Chad", "Channel Islands", "Chile", "China", "China, Hong Kong SAR", 
                        "China, Macao SAR", "China, mainland", "China, Taiwan Province of", 
                        "Colombia", "Comoros", "Congo", "Cook Islands", "Costa Rica", 
                        "Côte d'Ivoire", "Croatia", "Cuba", "Cyprus", "Czechia", "Czechoslovakia", 
                        "Democratic People's Republic of Korea", "Democratic Republic of the Congo", 
                        "Denmark", "Djibouti", "Dominica", "Dominican Republic", "Ecuador", 
                        "Egypt", "El Salvador", "Equatorial Guinea", "Eritrea", "Estonia", 
                        "Eswatini", "Ethiopia", "Ethiopia PDR", "Falkland Islands (Malvinas)", 
                        "Faroe Islands", "Fiji", "Finland", "France", "French Guyana", 
                        "French Polynesia", "Gabon", "Gambia", "Georgia", "Germany", 
                        "Ghana", "Gibraltar", "Greece", "Greenland", "Grenada", "Guadeloupe", 
                        "Guam", "Guatemala", "Guinea", "Guinea-Bissau", "Guyana", "Haiti", 
                        "Holy See", "Honduras", "Hungary", "Iceland", "India", "Indonesia", 
                        "Iran (Islamic Republic of)", "Iraq", "Ireland", "Isle of Man", 
                        "Israel", "Italy", "Jamaica", "Japan", "Jordan", "Kazakhstan", 
                        "Kenya", "Kiribati", "Kuwait", "Kyrgyzstan", "Lao People's Democratic Republic", 
                        "Latvia", "Lebanon", "Lesotho", "Liberia", "Libya", "Liechtenstein", 
                        "Lithuania", "Luxembourg", "Madagascar", "Malawi", "Malaysia", 
                        "Maldives", "Mali", "Malta", "Marshall Islands", "Martinique", 
                        "Mauritania", "Mauritius", "Mayotte", "Mexico", "Micronesia (Federated States of)", 
                        "Monaco", "Mongolia", "Montenegro", "Montserrat", "Morocco", 
                        "Mozambique", "Myanmar", "Namibia", "Nauru", "Nepal", "Netherlands", 
                        "Netherlands Antilles (former)", "New Caledonia", "New Zealand", 
                        "Nicaragua", "Niger", "Nigeria", "Niue", "Norfolk Island", "North Macedonia", 
                        "Northern Mariana Islands", "Norway", "Oman", "Pacific Islands Trust Territory", 
                        "Pakistan", "Palau", "Palestine", "Panama", "Papua New Guinea", 
                        "Paraguay", "Peru", "Philippines", "Pitcairn", "Poland", "Portugal", 
                        "Puerto Rico", "Qatar", "Republic of Korea", "Republic of Moldova", 
                        "Réunion", "Romania", "Russian Federation", "Rwanda", "Saint Helena, Ascension and Tristan da Cunha", 
                        "Saint Kitts and Nevis", "Saint Lucia", "Saint Pierre and Miquelon", 
                        "Saint Vincent and the Grenadines", "Samoa", "San Marino", "Sao Tome and Principe", 
                        "Saudi Arabia", "Senegal", "Serbia", "Serbia and Montenegro", 
                        "Seychelles", "Sierra Leone", "Singapore", "Slovakia", "Slovenia", 
                        "Solomon Islands", "Somalia", "South Africa", "South Sudan", 
                        "Spain", "Sri Lanka", "Sudan", "Sudan (former)", "Suriname", 
                        "Svalbard and Jan Mayen Islands", "Sweden", "Switzerland", "Syrian Arab Republic", 
                        "Tajikistan", "Thailand", "Timor-Leste", "Togo", "Tokelau", "Tonga", 
                        "Trinidad and Tobago", "Tunisia", "Turkey", "Turkmenistan", "Turks and Caicos Islands", 
                        "Tuvalu", "Uganda", "Ukraine", "United Arab Emirates", "United Kingdom of Great Britain and Northern Ireland", 
                        "United Republic of Tanzania", "United States of America", "United States Virgin Islands", 
                        "Uruguay", "USSR", "Uzbekistan", "Vanuatu", "Venezuela (Bolivarian Republic of)", 
                        "Viet Nam", "Wallis and Futuna Islands", "Western Sahara", "Yemen", 
                        "Yugoslav SFR", "Zambia", "Zimbabwe"),
              Amounts=c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 
                        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 
                        34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 
                        50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 
                        66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 
                        82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 
                        98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 
                        111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 
                        124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 
                        137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 
                        150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 
                        163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 
                        176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 
                        189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 
                        202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 
                        215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 
                        228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 
                        241, 242, 243)
            )

Does anyone have an idea the neatest way to achieve this?

1
  • Maybe agrep or stringdist::stringdist . Commented Jun 24, 2020 at 10:50

1 Answer 1

1

here is a fuzzyjoin-example... you will probably have to experiment with the various stringdist-methods and the function's arguments to get the results that suit your needs best.

library( fuzzyjoin )
stringdist_left_join( df1, df2, by = "country", method = "lcs", max_dist = 1 )

#                        country.x Amounts.x                country.y Amounts.y
# 1                      Argentina         0                Argentina         9
# 2                        Bahamas         2                  Bahamas        15
# 3                       Barbados         4                 Barbados        18
# 4                         Belize         6                   Belize        22
# 5                        Bolivia         8                     <NA>        NA
# 6                         Brazil        10                   Brazil        29
# 7           BritishVirginIslands        12   British Virgin Islands        30
# 8                          Chile        14                    Chile        43
# 9                       Colombia        16                 Colombia        49
# ....
Sign up to request clarification or add additional context in comments.

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.