R语言 1个字母的氨基酸变体变为3个字母

628mspwn  于 2023-05-04  发布在  其他
关注(0)|答案(4)|浏览(116)

我正在尝试把用单字母表示的氨基酸(AA)变体替换为三字母代码(更易阅读)。大部分结果都正确,只有少数几个出错。下面是我的代码和注释。谢谢。

# Example variant strings written with one-letter amino acid codes.
x <- c("p.G12C","p.F121S","p.P124S","p.P124L","p.E13D",
        "p.E203K","p.Q209P","p.Q209P","p.Q209L")

# Three-letter codes; aa3[i] is used as the replacement for aa1[i] below.
aa3 <- c("Ala", "Arg", "Asn", "Asp", "Cys", "Glu", "Gln", "Gly", "His",
           "Ile", "Leu", "Lys", "Met", "Phe", "Pro", "Ser", "Thr", "Trp",
           "Tyr", "Val")
# One-letter codes, listed in the same order as aa3.
aa1 <- c("A", "R", "N", "D", "C", "E", "Q", "G", "H", "I", "L", "K",
           "M", "F", "P", "S", "T", "W", "Y", "V")

# For every one-letter code, replace each occurrence in x with the matching
# three-letter code (case-sensitive).
# NOTE(review): every iteration reads the original x and overwrites xy, so as
# written xy only keeps the result of the last substitution ("V" -> "Val").
# The output reported below looks like the substitutions were chained instead
# (e.g. assigning back to x) -- confirm against the asker's session.
# When the substitutions are chained, later patterns also match capitals that
# earlier replacements introduced ("F" -> "Phe", then "P" -> "Pro" yields
# "Prohe"; "E" -> "Glu", then "G" -> "Gly" yields "Glylu").
for (i in 1:length(aa1))
{
  xy <- gsub(aa1[i],aa3[i],x,ignore.case = F)
}

输出

# Note that E, F and Q have unusual 3 letter replacement. 
  I could not figure out what is causing this.  
xy
[1] "p.Gly12Cys"    "p.Prohe121Ser" "p.Pro124Ser"   "p.Pro124Leu" 
    "p.Glylu13Asp"  "p.Glylu203Lys" "p.Glyln209Pro" "p.Glyln209Pro" "p.Glyln209Leu"

预期输出

"p.Gly12Cys"    "p.Phe121Ser" "p.Pro124Ser"   "p.Pro124Leu"   "p.Glu13Asp" 
"p.Glu203Lys" "p.Gln209Pro" "p.Gln209Pro" "p.Gln209Leu"

错误

outputs "p.Prohe121Ser"instead of "p.Phe121Ser" 
"p.Glylu13Asp" instead of  "p.Glu13Asp"
dw1jzc5e

dw1jzc5e1#

library(stringr)

# Lookup table: one-letter amino acid code -> three-letter code.
aa_codes <- c(
  "A" = "Ala", "R" = "Arg", "N" = "Asn", "D" = "Asp",
  "C" = "Cys", "E" = "Glu", "Q" = "Gln", "G" = "Gly",
  "H" = "His", "I" = "Ile", "L" = "Leu", "K" = "Lys",
  "M" = "Met", "F" = "Phe", "P" = "Pro", "S" = "Ser",
  "T" = "Thr", "W" = "Trp", "Y" = "Tyr", "V" = "Val"
)

# Match every capital letter in a single pass and translate it through the
# lookup table. Passing the named vector itself as the pattern applies the
# replacements one after another, so a later pattern would also match capitals
# introduced by an earlier replacement ("F" -> "Phe", then "P" -> "Pro" gives
# "Prohe"). Letters without an entry in the table are left unchanged.
str_replace_all(x, "[A-Z]", function(letter) {
  code <- unname(aa_codes[letter])
  ifelse(is.na(code), letter, code)
})
a6b3iqyw

a6b3iqyw2#

我们可以使用mgsub

library(qdap)
# mgsub() is given the one-letter codes, the three-letter codes and the
# variant strings, in that order.
# NOTE(review): elements 5 and 6 of the output pasted below ("p.Glu13Alasp",
# "p.Glu203Leuys") still differ from the expected "p.Glu13Asp" and
# "p.Glu203Lys", so this call does not fully solve the problem as shown.
mgsub(aa1, aa3, x)
#[1] "p.Gly12Cys"    "p.Phe121Ser"   "p.Pro124Ser"   "p.Pro124Leu"  
#[5] "p.Glu13Alasp"  "p.Glu203Leuys" "p.Gln209Pro"   "p.Gln209Pro"  
#[9] "p.Gln209Leu"

更新

# Split each variant into four comma-separated fields (two-character prefix,
# first residue, position, second residue) and parse the result as CSV.
# Every column is read as character: with the default type conversion a
# residue column made up only of "T"/"F" (Thr/Phe) would be turned into
# logical TRUE/FALSE and the lookup below would return NA.
d1 <- read.csv(text = sub("(..)(.)(\\d+)(.)", "\\1,\\2,\\3,\\4", x),
               header = FALSE, colClasses = "character",
               stringsAsFactors = FALSE)
# Translate both residue columns from one-letter to three-letter codes.
d1[c(2, 4)] <- lapply(d1[c(2, 4)], function(code) aa3[match(code, aa1)])
# Paste the fields of each row back together.
do.call(paste0, d1)
#[1] "p.Gly12Cys"  "p.Phe121Ser" "p.Pro124Ser" "p.Pro124Leu" "p.Glu13Asp" 
#[6] "p.Glu203Lys" "p.Gln209Pro" "p.Gln209Pro" "p.Gln209Leu"

或者使用gsubfn

library(gsubfn)
# Build a list of three-letter codes named by their one-letter codes, then let
# gsubfn() look each matched capital letter up in that list.
aa_lookup <- as.list(aa3)
names(aa_lookup) <- aa1
gsubfn("[A-Z]", aa_lookup, x)
#[1] "p.Gly12Cys"  "p.Phe121Ser" "p.Pro124Ser" "p.Pro124Leu" "p.Glu13Asp" 
#[6] "p.Glu203Lys" "p.Gln209Pro" "p.Gln209Pro" "p.Gln209Leu"
eblbsuwk

eblbsuwk3#

下面是一个基本R解决方案:

# Named lookup vector: one-letter code -> three-letter code.
ref <- aa3
names(ref) <- aa1
# Capture the residues and the position of every variant. Each row of tmp
# holds: full match, first residue, position, second residue.
tmp <- do.call(rbind, regmatches(x, regexec("p\\.([A-Z])([0-9]+)([A-Z])", x)))
# Look both residue columns up in ref. matrix() restores the two-column shape
# that tmp[, c(2, 4)] loses when x holds a single variant; calling apply() on
# that dropped vector would fail.
tmp2 <- matrix(ref[as.vector(tmp[, c(2, 4)])], ncol = 2L)
# Reassemble prefix, first residue, position and second residue.
paste0("p.", tmp2[, 1], tmp[, 3], tmp2[, 2])
#[1] "p.Gly12Cys"  "p.Phe121Ser" "p.Pro124Ser" "p.Pro124Leu" "p.Glu13Asp"  "p.Glu203Lys" "p.Gln209Pro" "p.Gln209Pro" "p.Gln209Leu"

基本思路是把字符串拆分成各个组成部分,例如 "p.Q209L" 被拆分为 p.、Q、209 和 L。然后用参考向量 ref 把氨基酸的单字母表示换成对应的三字母表示;或者按照 Akrun 的方法,去掉 ref[x](以及额外的两行!),改用 aa3[match(x, aa1)]。最后再把各部分拼接回去。

yb3bgrhw

yb3bgrhw4#

library(stringr)
library(tidyverse)

# Expand the one-letter codes A, C, G, H, I, L, M, P, S, T and V found in `x`
# to their three-letter codes via str_replace_all().
repx1 <- function(x) {
  codes <- c(
    "A" = "Ala", "C" = "Cys", "G" = "Gly", "H" = "His",
    "I" = "Ile", "L" = "Leu", "M" = "Met", "P" = "Pro",
    "S" = "Ser", "T" = "Thr", "V" = "Val"
  )
  str_replace_all(x, codes)
}

# Expand the one-letter codes R, N, D, E, Q, K, F, W and Y found in `x` to
# their three-letter codes via str_replace_all().
repx2 <- function(x) {
  codes <- c(
    "R" = "Arg", "N" = "Asn", "D" = "Asp",
    "E" = "Glu", "Q" = "Gln", "K" = "Lys",
    "F" = "Phe", "W" = "Trp", "Y" = "Tyr"
  )
  str_replace_all(x, codes)
}

# Run repx1 before repx2. The replacements made by repx2 introduce the
# capitals A, G, L, P and T, which repx1 would substitute a second time if it
# ran afterwards; in this order neither step matches the other's output.
"p.Q307R" %>% repx1 %>% repx2

相关问题