我试图用六边形总结单细胞测序数据的UMAP散点图。由于目标是简化非常繁忙的聚类结果,我为每个bin混合颜色(=六边形),换句话说,如果有2个小区来自簇1,8个小区来自簇2,我将分配给这些簇的颜色按单元格的比例混合,这意味着我需要为每个六边形分配一种特定的颜色。
请原谅代码太长,我已经尽可能地缩短了。
library(hexbin)
library(ggplot2)
library(tibble)
####################
# helper functions #
####################
#' Determines majority in a vector
#' @description
#' Changed version of mclust::majorityVote. Ties are broken randomly.
#'
#' @param x a vector
#'
#' @returns
#' A single element of x that has the highest count.
#'
get_majority <- function(x){
x <- as.vector(x)
tally <- table(x)
max_idx <- seq_along(tally)[tally == max(tally, na.rm = TRUE)]
if(length(max_idx) > 1){
max_idx <- sample(max_idx, size = 1)
}
majority <- names(tally)[max_idx]
return(majority)
}
###################
# Toy data
umap_coords <- tibble( x = rnorm(1000),
y = rnorm(1000),
cluster = rep(c(1,2,3,4,5), 200))
colors <- c("#8DD3C7",
"#FFFFB3",
"#BEBADA",
"#FB8072",
"#80B1D3")
names(colors) <- 1:5
hexb <- hexbin::hexbin(umap_coords$x,
umap_coords$y,
xbins = 10,
xbnds = c(min(umap_coords$x),
max(umap_coords$x)),
ybnds = c(min(umap_coords$y),
max(umap_coords$y)),
IDs = TRUE)
gghex <- data.frame(hexbin::hcell2xy(hexb),
count = hexb@count,
cell = hexb@cell,
xo = hexb@xcm,
yo = hexb@ycm,
hexclust = NA)
for (i in seq_along(gghex$cell)){
cell_id <- gghex$cell[i]
hcnt <- gghex$count[i]
orig_id <- which(hexb@cID == cell_id)
umap_coords[orig_id,"hexbin"] <- cell_id
gghex$hexclust[i] <- get_majority(umap_coords[orig_id, "cluster"])
}
hex_colors <- vector(mode = "character", length = length(gghex$cell))
# For simplicity, here I assign a fixed color per cluster.
for (n in seq_along(gghex$cell)){
hex_colors[n] <- colors[names(colors) == gghex$hexclust[n]]
}
gghex$colors <- hex_colors
# I define the data in the geom because I combine it with a scatterplot from a different data.frame.
# (scatter plot is not relevatn for the question though.)
p <- ggplot2::ggplot() +
ggplot2::geom_hex(data = gghex,
mapping = ggplot2::aes(x = x,
y = y),
fill = gghex$colors,
alpha = 0.8,
stat = "identity")
p
然而,结果图显然没有将颜色分配给正确的六边形。如果我将聚类分配给aes()
,我会得到不同的图片:
ggplot2::ggplot() +
ggplot2::geom_hex(data = gghex,
mapping = ggplot2::aes(x = x,
y = y,
fill = hexclust),
alpha = 0.8,
stat = "identity")
p
现在,对于这个特殊的玩具问题,我可以通过scale_fill_manual
指定颜色:
names(hex_colors) <- gghex$hexclust
ggplot2::ggplot() +
ggplot2::geom_hex(data = gghex,
mapping = ggplot2::aes(x = x,
y = y,
fill = hexclust),
alpha = 0.8,
stat = "identity") +
scale_fill_manual(values = hex_colors, guide = "none")
但是请记住,在我的实际问题中,我必须为 * 每个 * 六边形指定一个特定的颜色,而这里geom_hex似乎崩溃了:
names(hex_colors) <- as.character(gghex$cell)
ggplot2::ggplot() +
ggplot2::geom_hex(data = gghex,
mapping = ggplot2::aes(x = x,
y = y,
fill = as.character(cell)),
alpha = 0.8,
stat = "identity") +
scale_fill_manual(values = hex_colors, guide = "none")
p
正如你所看到的,六边形的大小突然完全错误了。我读到哈德利的一个简短建议,在aes
中使用group = 1
来使六边形相互感知,但这对我也不起作用。
关于如何使用geom_hex获得工作图,有人有什么建议吗?
多谢了!
- 编辑:**@Allen Cameron的答案解决了最初提出的问题,如果编辑没有最终答案,我会将其标记为解决方案。
但是,我发现如果我实际上为数据指定唯一的颜色,geom_hex会再次生成不同大小的六边形:
library(hexbin)
library(ggplot2)
library(tibble)
####################
# helper functions #
####################
#' Determines majority in a vector
#' @description
#' Changed version of mclust::majorityVote. Ties are broken randomly.
#'
#' @param x a vector
#'
#' @returns
#' A single element of x that has the highest count.
#'
get_majority <- function(x){
x <- as.vector(x)
tally <- table(x)
max_idx <- seq_along(tally)[tally == max(tally, na.rm = TRUE)]
if(length(max_idx) > 1){
max_idx <- sample(max_idx, size = 1)
}
majority <- names(tally)[max_idx]
return(majority)
}
#' Mixes the colors of two clusters proportionally.
#'
#' @param df data.frame of cells with clusters in `color_by` and assigned
#' hex bin in `hexbin`.
#' @param colors colors to be mixed.
#' @param cell Which hexbin to mix colors in.
#' @param color_by Column name where the clusters/groups are stored in `df`.
#'
#' @returns
#' Mixed color as hex code.
#'
mix_rgb <- function(df, colors, cell, color_by){
rgbcols <- col2rgb(colors)
sel <- which(df$hexbin == cell)
n_clust <- dplyr::pull(df[sel,color_by])
n_clust <- table(as.character(n_clust))
prop <- as.numeric(n_clust)
names(prop) <- names(n_clust)
prop <- prop/sum(prop)
rgb_new <- sweep(rgbcols[,names(prop), drop=FALSE], MARGIN =2, FUN = "*", prop)
rgb_new <- rowSums(rgb_new)
rgb_new <- rgb(red = rgb_new["red"],
green = rgb_new["green"],
blue = rgb_new["blue"],
maxColorValue = 255)
return(rgb_new)
}
###################
umap_coords <- tibble( x = rnorm(1000),
y = rnorm(1000),
cluster = rep(c(1,2,3,4,5), 200))
colors <- c("#8DD3C7",
"#FFFFB3",
"#BEBADA",
"#FB8072",
"#80B1D3")
names(colors) <- 1:5
hexb <- hexbin::hexbin(umap_coords$x,
umap_coords$y,
xbins = 10,
xbnds = c(min(umap_coords$x),
max(umap_coords$x)),
ybnds = c(min(umap_coords$y),
max(umap_coords$y)),
IDs = TRUE)
gghex <- data.frame(hexbin::hcell2xy(hexb),
count = hexb@count,
cell = hexb@cell,
xo = hexb@xcm,
yo = hexb@ycm,
hexclust = NA)
for (i in seq_along(gghex$cell)){
cell_id <- gghex$cell[i]
hcnt <- gghex$count[i]
orig_id <- which(hexb@cID == cell_id)
umap_coords[orig_id,"hexbin"] <- cell_id
gghex$hexclust[i] <- get_majority(umap_coords[orig_id, "cluster"])
}
hex_colors <- vector(mode = "character", length = length(gghex$cell))
for (n in seq_along(gghex$cell)){
hex_colors[n] <- mix_rgb(umap_coords,
colors = colors,
cell = gghex$cell[n],
color_by = "cluster")
}
gghex$colors <- hex_colors
ggplot2::ggplot() +
ggplot2::geom_hex(data = gghex,
mapping = ggplot2::aes(x = x,
y = y,
fill = colors),
alpha = 0.8,
stat = "identity") +
scale_fill_identity()
结果图如下所示:
1条答案
按热度按时间tcbh2hod1#
如果要根据
color
列填充每个六边形,可使用scale_fill_identity
:我们可以看到,所有的颜色都是所需的颜色,并且通过将它们的聚类和颜色值作为字符串添加到六边形上来匹配指定的聚类:
对于数据的编辑版本,这是需要
group = 1
的地方: