# Identify multiple spellings in R
# Tally how many rows of `cities` carry each distinct spelling, then give
# every distinct spelling a sequential index `i`.
dta <- cities %>%
  group_by(city) %>%
  tally() %>%
  ungroup() %>%
  mutate(i = row_number())
# Identify multiple spellings in R
# Tally how many rows of `cities` carry each distinct spelling, then give
# every distinct spelling a sequential index `i`.
dta <- cities %>%
  group_by(city) %>%
  tally() %>%
  ungroup() %>%
  mutate(i = row_number())
# Identify multiple spellings in R
library(dplyr)
library(stringdist)
# Identify multiple spellings in R
# Score every unordered pair of distinct spellings with stringsim()'s "jw"
# method and list the most similar pairs first.
pairs <- expand.grid(
  x = seq_len(nrow(dta)),
  y = seq_len(nrow(dta))
) %>%
  # Keep each pair once: drop self-pairs and mirrored duplicates.
  filter(x < y) %>%
  # Attach the spelling and its count for the first member of the pair.
  left_join(rename(dta, cityx = city, nx = n), by = c(x = "i")) %>%
  # Attach the spelling and its count for the second member of the pair.
  left_join(rename(dta, cityy = city, ny = n), by = c(y = "i")) %>%
  mutate(similarity = stringsim(cityx, cityy, method = "jw")) %>%
  arrange(desc(similarity))
# Identify multiple spellings in R
# Example input: one row per recorded city name, with several spelling
# variants of the same places mixed in.
cities <- data.frame(
  city = c(
    "bangalore", "bengaluru", "banglore",
    rep("bangalore", 3),
    "new york", "newyork", "nyork",
    rep("new york", 2)
  )
)
# Identify multiple spellings in R
# > pairs
# x y cityx nx cityy ny similarity
# 1 1 2 bangalore 4 banglore 1 0.9629630
# 2 4 5 new york 3 newyork 1 0.9583333
# 3 5 6 newyork 1 nyork 1 0.9047619
# 4 4 6 new york 3 nyork 1 0.8750000
# 5 2 3 banglore 1 bengaluru 1 0.7222222
# 6 1 3 bangalore 4 bengaluru 1 0.6944444
# 7 2 6 banglore 1 nyork 1 0.6583333
# 8 2 5 banglore 1 newyork 1 0.6011905
# 9 1 5 bangalore 4 newyork 1 0.5873016
# 10 2 4 banglore 1 new york 3 0.5833333
# 11 1 4 bangalore 4 new york 3 0.5694444
# 12 3 5 bengaluru 1 newyork 1 0.4761905
# 13 3 4 bengaluru 1 new york 3 0.4583333
# 14 1 6 bangalore 4 nyork 1 0.4370370
# 15 3 6 bengaluru 1 nyork 1 0.4370370
Copyright © 2021 Codeinu
Forgot your account's password or having trouble logging into your Account? Don't worry, we'll help you to get back your account. Enter your email address and we'll send you a recovery link to reset your password. If you are experiencing problems resetting your password contact us