R:将数据单元拆分为片段

o7jaxewo  于 2023-03-20  发布在  其他
关注(0)|答案(3)|浏览(114)

我需要提取字符串开头的两个字符和末尾的数值。下面是一个数据单元格中的例子。我需要获取的是前两位数字(例如 12),以及冒号后面的数字(例如 4),但引号(")和冒号(:)让事情变得复杂。任何帮助都将不胜感激。非常感谢!

{
  "120710103043": 4,
  "160179505004": 10,
  "300930004002": 4,
  "301010002001": 4,
  "370190202042": 4,
  "410099702003": 4,
  "410499701003": 4,
  "410599509002": 4,
  "518100454051": 4,
  "530110406041": 4,
  "530150004005": 4,
  "530150005012": 4,
  "530250105001": 4,
  "530270006002": 4,
  "530270007001": 4,
  "530270012004": 4,
  "530330297001": 4,
  "530330303053": 4,
  "530330304013": 4,
  "530330304041": 4,
  "530330326011": 4,
  "530419704001": 4,
  "530419707002": 4,
  "530419708004": 4,
  "530419711002": 4,
  "530419712002": 4,
  "530419714001": 4,
  "530419718001": 8,
  "530459610002": 4,
  "530530611004": 4,
  "530530614001": 4,
  "530530628013": 4,
  "530530702062": 4,
  "530530703121": 4,
  "530530704012": 4,
  "530530704033": 4,
  "530530712061": 4,
  "530530712081": 4,
  "530530713062": 4,
  "530530714031": 4,
  "530530714032": 5,
  "530530714064": 5,
  "530530714071": 4,
  "530530714072": 4,
  "530530714081": 4,
  "530530714102": 4,
  "530530714103": 4,
  "530530714104": 4,
  "530530714111": 4,
  "530530714112": 4,
  "530530715061": 4,
  "530530715063": 4,
  "530530716024": 4,
  "530530717042": 4,
  "530530718052": 4,
  "530530718053": 4,
  "530530721123": 4,
  "530530728002": 4,
  "530530729011": 4,
  "530530729031": 6,
  "530530729051": 4,
  "530530729052": 4,
  "530530729053": 4,
  "530530729062": 4,
  "530530729072": 4,
  "530530729073": 4,
  "530530730011": 56,
  "530530730012": 36,
  "530530730013": 51,
  "530530730014": 44,
  "530530730015": 55,
  "530530730051": 4,
  "530530730052": 19,
  "530530730053": 27,
  "530530730054": 28,
  "530530730061": 88,
  "530530730062": 60,
  "530530731083": 4,
  "530530731113": 4,
  "530530731133": 5,
  "530530731142": 4,
  "530530731171": 4,
  "530530731172": 13,
  "530530731181": 6,
  "530530731182": 4,
  "530530731191": 4,
  "530530731192": 7,
  "530530731242": 5,
  "530530731251": 4,
  "530530732001": 12,
  "530530732002": 8,
  "530530732003": 14,
  "530530732004": 24,
  "530530733013": 8,
  "530530733014": 4,
  "530579515005": 4,
  "530579518001": 4,
  "530610410004": 4,
  "530610508005": 4,
  "530630118003": 4,
  "530670103002": 4,
  "530670107001": 4,
  "530670108004": 4,
  "530670108005": 5,
  "530670109101": 4,
  "530670112003": 4,
  "530670114101": 4,
  "530670114104": 4,
  "530670114201": 5,
  "530670114204": 4,
  "530670115002": 4,
  "530670115003": 7,
  "530670116104": 4,
  "530670116211": 4,
  "530670116213": 4,
  "530670116223": 4,
  "530670116232": 7,
  "530670116242": 4,
  "530670117102": 4,
  "530670117202": 4,
  "530670122124": 4,
  "530670122213": 4,
  "530670122221": 4,
  "530670123105": 4,
  "530670123201": 4,
  "530670123202": 12,
  "530670123301": 4,
  "530670124111": 121,
  "530670124112": 184,
  "530670124113": 143,
  "530670124121": 115,
  "530670124122": 151,
  "530670124201": 13,
  "530670124202": 14,
  "530670124203": 7,
  "530670125101": 135,
  "530670125201": 124,
  "530670125202": 59,
  "530670125203": 70,
  "530670125204": 41,
  "530670125301": 49,
  "530670125302": 34,
  "530670125303": 55,
  "530670126101": 4,
  "530670126103": 6,
  "530670126104": 14,
  "530670126105": 4,
  "530670126201": 4,
  "530670126203": 5,
  "530670126204": 11,
  "530670126205": 5,
  "530670127101": 4,
  "530670127202": 4,
  "530670127204": 6,
  "530670127303": 4,
  "530770020022": 4
}

我尝试过 str_*、string、fixed、df %>% separate 等方法,但都给了我错误信息。

dddzy1tm

dddzy1tm1#

我们可以先提取数字,然后抓取所需的位置:

# Attach stringr (str_extract, str_sub) and dplyr (tibble, mutate, select,
# %>%) before any of their functions are called.
library(stringr)
library(dplyr)

# Example input: one string per row in the form "<digit id>: <count>".
df <- tibble(x = c(
  "120710103043: 4",
  "160179505004: 10",
  "300930004002: 4",
  "301010002001: 4"
))

# Derive two integer columns from `x`: the first two digits of the id and the
# number that follows ": ".
df %>%
  mutate(
    # First run of digits in the string, i.e. the id in front of the colon.
    numbers = str_extract(x, "\\d+"),
    first_two = as.integer(str_sub(numbers, 1, 2)),
    # str_extract() yields ": <count>"; str_sub(..., 3) drops the ": " prefix.
    last_one = as.integer(str_sub(str_extract(x, ": \\d+"), 3))
  ) %>%
  # Drop the raw input and the helper column so only the two results remain.
  select(-x, -numbers)
first_two last_one
      <int>    <int>
1        12        4
2        16       10
3        30        4
4        30        4
50pmv0ei

50pmv0ei2#

看起来像JSON。

library(jsonlite)
library(dplyr)

# The sample is a JSON object mapping an id string to an integer count; a raw
# string literal keeps the embedded double quotes unescaped.
jsonstr <- r"({
  "120710103043": 4,
  "160179505004": 10,
  "300930004002": 4,
  "301010002001": 4,
  "370190202042": 4,
  "410099702003": 4,
  "410499701003": 4,
  "410599509002": 4,
  "518100454051": 4,
  "530110406041": 4
})"

# Parse the JSON into a named list, turn it into a name/value tibble, flatten
# the list column of counts, and add the first two characters of each id as
# `name2` directly after `name`.
jsonstr %>%
  parse_json() %>%
  tibble::enframe() %>%
  mutate(
    value = unlist(value),
    name2 = substr(name, start = 1, stop = 2),
    .after = name
  )
#> # A tibble: 10 × 3
#>    name         name2 value
#>    <chr>        <chr> <int>
#>  1 120710103043 12        4
#>  2 160179505004 16       10
#>  3 300930004002 30        4
#>  4 301010002001 30        4
#>  5 370190202042 37        4
#>  6 410099702003 41        4
#>  7 410499701003 41        4
#>  8 410599509002 41        4
#>  9 518100454051 51        4
#> 10 530110406041 53        4

创建于2023年3月12日,使用reprex v2.0.2

4jb9z9bj

4jb9z9bj3#

使用 tidyr 的 extract 的一种方案

library(tidyr)
# Split each "<id>: <count>" string in column `x` into two integer columns:
# the first two digits of the id and the complete number after the colon.
# The second group is anchored to the colon and matches every trailing digit,
# so counts of any length (4, 10, 121, ...) are captured whole; taking the
# last two characters would truncate counts with three or more digits.
# The column name `last_two` is kept so the result has the same shape as before.
extract(
  df, x,
  into = c("first_two", "last_two"),
  regex = "^(\\d{2})\\d*:\\s*(\\d+)$",
  convert = TRUE
)
  • 输出
# A tibble: 4 × 2
  first_two last_two
      <int>    <int>
1        12        4
2        16       10
3        30        4
4        30        4

如果是JSON字符串

library(jsonlite)
# Base-R variant for the JSON string: stack the parsed name/value pairs into a
# two-column data frame, take the first two characters of each id, keep only
# the two result columns, and let type.convert() turn the id prefix into an
# integer. local() keeps the intermediate data frame out of the workspace.
local({
  long <- stack(fromJSON(jsonstr))
  names(long) <- c("last_two", "ind")
  long$first_two <- substring(long$ind, 1, 2)
  type.convert(long[, c("first_two", "last_two")], as.is = TRUE)
})
  • 输出
first_two last_two
1         12        4
2         16       10
3         30        4
4         30        4
5         37        4
6         41        4
7         41        4
8         41        4
9         51        4
10        53        4

相关问题