r – Assign similar company names to same group id

I merged to datasets (dt, dt2) based on the exact match between company names in order to assign group_id to the second dataset (dt2). Now, I would like to fill the empty values โ€‹โ€‹of group_id with the ones from the same company (different branches) but having different names.

I constructed the data using this merge code

dt2 <- merge(dt2,dt[,c("psn_name_PAT","group_id")],by = c("psn_name_PAT"),all.x = T,all.y = F)

DATA

                                      psn_name_PAT   n      name_std group_id
1                            CHICHIBU FUJI COMPANY   2 CHICHIBU FUJI       NA
2                                    FUJI CERAMICS   1 FUJI CERAMICS       NA
3                                    FUJI CHEMICAL   1          FUJI      606
4                            FUJI ELECTRIC COMPANY 439          FUJI      606
5   FUJI ELECTRIC CORPORATE RESEARCH & DEVELOPMENT   5          FUJI      606
6 FUJI ELECTRIC CORPORATE RESEARCH AND DEVELOPMENT  12          FUJI      606

DESIRED Output:

                                      psn_name_PAT   n      name_std group_id
1                            CHICHIBU FUJI COMPANY   2 CHICHIBU FUJI      606
2                                    FUJI CERAMICS   1 FUJI CERAMICS      606
3                                    FUJI CHEMICAL   1          FUJI      606
4                            FUJI ELECTRIC COMPANY 439          FUJI      606
5   FUJI ELECTRIC CORPORATE RESEARCH & DEVELOPMENT   5          FUJI      606
6 FUJI ELECTRIC CORPORATE RESEARCH AND DEVELOPMENT  12          FUJI      606

I tried the following code as a way to match ‘not-exact word’ matches from another dataset (dt2), but unfortunately it brings different group_id not the once from the right company (ie FUJI):

dt2 <- dt2 %>% 
  mutate(group_id=ifelse(grepl(paste0("\",fuz$name_std,"\b", collapse = "|"), name_std),fuz$group_id,NA))

Any idea on how I could assign the right company group_id (ie FUJI) to different branches names?

Thank you in advance for your help!


DATASET

dt

    structure(list(psn_name_PAT = c("FUJI CHEMICAL", "FUJI ELECTRIC COMPANY", 
"FUJI ELECTRIC CORPORATE RESEARCH & DEVELOPMENT", "FUJI ELECTRIC CORPORATE RESEARCH AND DEVELOPMENT", 
"FUJI ELECTRIC CORPORATION RESEARCH AND DEVELOPMENT", "FUJI ELECTRIC SYSTEMS COMPANY", 
"FUJI ELECTRIC TECHNOLOGY COMPANY", "FUJI MACHINE MANUFACTURING COMPANY", 
"FUJITSU", "FUJITSU GENERAL", "FUJIMI", "FUJIFILM", "FUJIFILM ELECTRONIC MATERIALS U.S.A.", 
"FUJIFILM MANUFACTURING EUROPE", "FUJIKURA", "FUJI MACHINERY MFG. & ELECTRONICS COMPANY", 
"FUJI XEROX COMPANY", "FUJIKIN", "FUJIKOSHI MACHINERY CORPORATION", 
"HONGFUJIN PRECISION INDUSTRY (SHENZHEN) COMPANY", "TOSHIBA AMERICA RESEARCH", 
"TOSHIBA CORPORATION", "TOSHIBA MACHINE COMPANY", "TOSHIBA MATERIALS COMPANY", 
"TOSHIBA MEDICAL SYSTEMS CORPORATION", "TOSHIBA SOLUTIONS CORPORATION"
), name_std = c("FUJI", "FUJI", "FUJI", "FUJI", "FUJI", "FUJI", 
"FUJI", "FUJI", "FUJITSU", "FUJITSU", "FUJIMI", "FUJIFILM", "FUJIFILM", 
"FUJIFILM", "FUJIKURA", "FUJI MFG", "FUJI XEROX", "FUJIKIN", 
"FUJIKOSHI", "HONGFUJIN", "TOSHIBA", "TOSHIBA", "TOSHIBA", "TOSHIBA", 
"TOSHIBA", "TOSHIBA"), n = c(45L, 45L, 45L, 45L, 45L, 45L, 45L, 
45L, 6L, 6L, 4L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L), group_id = c(606L, 606L, 606L, 606L, 606L, 606L, 
606L, 606L, 614L, 614L, 613L, 609L, 609L, 609L, 612L, 607L, 608L, 
610L, 611L, 705L, 1631L, 1631L, 1631L, 1631L, 1631L, 1631L)), class = c("tbl_df", 
"tbl", "data.frame"), row.names = c(NA, -26L))

dt2

    structure(list(psn_name_PAT = c("CHICHIBU FUJI COMPANY", "FUJI CERAMICS", 
"FUJI CHEMICAL", "FUJI ELECTRIC COMPANY", "FUJI ELECTRIC CORPORATE RESEARCH & DEVELOPMENT", 
"FUJI ELECTRIC CORPORATE RESEARCH AND DEVELOPMENT", "FUJI ELECTRIC CORPORATION RESEARCH AND DEVELOPMENT", 
"FUJI ELECTRIC DEVICE TECHNOLOGY COMPANY", "FUJI ELECTRIC SYSTEMS COMPANY", 
"FUJI ELECTRIC TECHNOLOGY COMPANY", "FUJI ELECTROCHEMICAL COMPANY", 
"FUJI FILM MICRODEVICES COMPANY", "FUJI HEAVY IND", "FUJI MACHINE MANUFACTURING COMPANY", 
"FUJI MACHINERY MFG. & ELECTRONICS COMPANY", "FUJI SEIKI MACHINE WORKS", 
"FUJI XEROX COMPANY", "FUJIFILM", "FUJIFILM DIAMATIX", "FUJIFILM DIMATIX", 
"FUJIFILM ELECTRONIC IMAGING", "FUJIFILM ELECTRONIC MATERIALS U.S.A.", 
"FUJIFILM MANUFACTURING EUROPE", "FUJIKIN", "FUJIKOSHI KIKAI KOGYO", 
"FUJIKOSHI MACHINERY CORPORATION", "FUJIKURA", "FUJIMA", "FUJIMI", 
"FUJINON CORPORATION", "FUJITSU", "FUJITSU AMD SEMICONDUCTOR", 
"FUJITSU AMD SEMICONDUCTOR LIMITED (FASL)", "FUJITSU AND SEMICONDUCTOR", 
"FUJITSU AUTOMATION", "FUJITSU COMPONENT", "FUJITSU DISPLAY TECHNOLOGIES CORPORATION", 
"FUJITSU FRONTECH", "FUJITSU GENERAL", "FUJITSU HITACHI PLASMA DISPLAY", 
"FUJITSU LIMITED KABUSHIKI KAISHA TOSHIBA", "FUJITSU MEDIA DEVICES", 
"FUJITSU MICROCOMPUTER SYSTEMS", "FUJITSU MICROELECTRONICS", 
"FUJITSU MICROELECTRONICS LMIITED", "FUJITSU MIYAGI ELECTRONICS", 
"FUJITSU QUANTUM DEVICES", "FUJITSU SEMICONDUCTOR", "FUJITSU TAKAMISAWA COMPONENT", 
"FUJITSU TEN", "FUJITSU VLSI", "FUJITSU YAMANASHI ELECTRONICS", 
"HONGFUJIN PRECISION INDUSTRY (SHENZHEN) COMPANY", "KYUSHU FUJITSU ELECTRONICS", 
"NEC TOSHIBA SPACE SYSTEMS", "SUZUKA FUJI XEROX COMPANY", "TOSHIBA AMERICA ELECTRONIC COMPONENTS", 
"TOSHIBA AMERICA RESEARCH", "TOSHIBA AUTOMATION COMPANY", "TOSHIBA AVE CORPORATION", 
"TOSHIBA CERAMICS COMPANY", "TOSHIBA COMPONENTS COMPANY", "TOSHIBA CORPORATION", 
"TOSHIBA KIKAI", "TOSHIBA LIGHTING & TECHNOLOGY CORPORATION", 
"TOSHIBA MACHINE COMPANY", "TOSHIBA MATERIALS COMPANY", "TOSHIBA MATSUSHITA DISPLAY TECHNOLOGY COMPANY", 
"TOSHIBA MEDICAL SYSTEMS CORPORATION", "TOSHIBA MICROELECTRONICS CORPORATION", 
"TOSHIBA MOBILE DISPLAY COMPANY", "TOSHIBA SHIBAURA DENKI", "TOSHIBA SILICONE COMPANY", 
"TOSHIBA SOLUTIONS CORPORATION", "TOSHIBA TEC CORPORATION", "TOSHIBA TECHNO CENTER"
), n = c(2L, 1L, 1L, 439L, 5L, 12L, 1L, 43L, 38L, 1L, 1L, 1L, 
1L, 2L, 2L, 4L, 129L, 549L, 1L, 5L, 1L, 1L, 2L, 5L, 1L, 1L, 35L, 
1L, 7L, 20L, 2430L, 7L, 1L, 2L, 1L, 2L, 4L, 2L, 1L, 1L, 1L, 36L, 
1L, 126L, 1L, 1L, 56L, 312L, 1L, 3L, 25L, 1L, 6L, 13L, 1L, 1L, 
14L, 1L, 1L, 1L, 19L, 1L, 5609L, 7L, 5L, 5L, 15L, 22L, 8L, 1L, 
2L, 1L, 1L, 1L, 3L, 4L), name_std = c("CHICHIBU FUJI", "FUJI CERAMICS", 
"FUJI", "FUJI", "FUJI", "FUJI", "FUJI", "FUJI DEVICE", "FUJI", 
"FUJI", "FUJI ELECTROCHEMICAL", "FUJI MICRODEVICES", "FUJI HEAVY IND", 
"FUJI", "FUJI MFG", "FUJI SEIKI WORKS", "FUJI XEROX", "FUJIFILM", 
"FUJIFILM DIAMATIX", "FUJIFILM DIMATIX", "FUJIFILM IMAGING", 
"FUJIFILM", "FUJIFILM", "FUJIKIN", "FUJIKOSHI KIKAI", "FUJIKOSHI", 
"FUJIKURA", "FUJIMA", "FUJIMI", "FUJINON", "FUJITSU", "FUJITSU AMD SEMICONDUCTOR", 
"FUJITSU AMD SEMICONDUCTOR FASL", "FUJITSU SEMICONDUCTOR", "FUJITSU AUTOMATION", 
"FUJITSU COMPONENT", "FUJITSU DISPLAY", "FUJITSU FRONTECH", "FUJITSU", 
"FUJITSU HITACHI PLASMA DISPLAY", "FUJITSU KABUSHIKI KAISHA TOSHIBA", 
"FUJITSU DEVICES", "FUJITSU MICROCOMPUTER", "FUJITSU MICROELECTRONICS", 
"FUJITSU MICROELECTRONICS LMIITED", "FUJITSU MIYAGI", "FUJITSU QUANTUM DEVICES", 
"FUJITSU SEMICONDUCTOR", "FUJITSU TAKAMISAWA COMPONENT", "FUJITSU TEN", 
"FUJITSU VLSI", "FUJITSU YAMANASHI", "HONGFUJIN", "KYUSHU FUJITSU", 
"NEC TOSHIBA SPACE", "SUZUKA FUJI XEROX", "TOSHIBA COMPONENTS", 
"TOSHIBA", "TOSHIBA AUTOMATION", "TOSHIBA AVE", "TOSHIBA CERAMICS", 
"TOSHIBA COMPONENTS", "TOSHIBA", "TOSHIBA KIKAI", "TOSHIBA LIGHTING", 
"TOSHIBA", "TOSHIBA", "TOSHIBA MATSUSHITA DISPLAY", "TOSHIBA", 
"TOSHIBA MICROELECTRONICS", "TOSHIBA MOBILE DISPLAY", "TOSHIBA SHIBAURA DENKI", 
"TOSHIBA SILICONE", "TOSHIBA", "TOSHIBA TEC", "TOSHIBA TECHNO"
), group_id = c(NA, NA, 606L, 606L, 606L, 606L, 606L, NA, 606L, 
606L, NA, NA, NA, 606L, 607L, NA, 608L, 609L, NA, NA, NA, 609L, 
609L, 610L, NA, 611L, 612L, NA, 613L, NA, 614L, NA, NA, NA, NA, 
NA, NA, NA, 614L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, 705L, NA, NA, NA, NA, 1631L, NA, NA, NA, NA, 1631L, NA, 
NA, 1631L, 1631L, NA, 1631L, NA, NA, NA, NA, 1631L, NA, NA)), row.names = c(NA, 
-76L), class = "data.frame")

Leave a Comment