使用for循环从R中的dfs列表中的dfs提取对角线

wz3gfoph  于 2023-05-04  发布在  其他
关注(0)|答案(2)|浏览(256)

我从八个调查地点A到H和九年(1999年,2000年,2001年,2006年,2008年,2011年,2013年,2016年,2019年)获得数据。

library (dplyr)
# create column names for years
years <- c(1999, 2000, 2001, 2006, 2008, 2011, 2013, 2016, 2019)

# create an empty data frame with sites A:H in the first column
df <- data.frame(Site = c("A", "B", "C", "D", "E", "F", "G", "H"),
                 matrix(NA, nrow = 8, ncol = length(years), 
                        dimnames = list(NULL, years)))

# remove the X before the year in the column names
colnames(df)[-1] <- years

# generate random numbers for each site and year
set.seed(123)  # for reproducibility
for (i in 1:nrow(df)) {
  df[i, 2:ncol(df)] <- runif(length(years))
}

我计算了调查地点和年份的欧氏距离,将此矩阵存储为 Dataframe ,并将其拆分为Site的 Dataframe 列表

# gather to long form to calculate dist.
df = df %>% gather (key = "Year", value = "Value", c(2:last_col()))

# calculate dist and set as a df
dist.df <- df %>% 
  mutate(YrSi = paste(substr(Year, 3, 4), Site)) %>%
  select(-Year, -Site) %>%
  column_to_rownames(var = "YrSi") %>%
  dist() %>%
  as.matrix() %>%
  as.data.frame()

#split dist.df to a list of dfs per site

dist.df.list = dist.df %>% 
  rownames_to_column("YrSi") %>% 
  separate(YrSi, c("Year", "Site"), sep = " ") %>% 
  mutate (Year =  as.numeric (ifelse (Year == "99", sprintf("19%s", Year),sprintf("20%s", Year)))) %>% # Change to yyyy
  gather(key = "YrSi", value = "Dist", c(3:last_col())) %>% 
  separate(YrSi, c("Year2", "Site2"), sep = " ") %>% 
  mutate (Year2 =  as.numeric (ifelse (Year2 == "99", sprintf("19%s", Year2),sprintf("20%s", Year2)))) %>% # Change to yyyy
  arrange(Site, Year, Site2, Year2) %>%
  spread (key = "Year2", value = "Dist") %>% 
  group_by(Site, Site2) %>% 
  subset (Site == Site2) %>%
  relocate (Year, .after = Site2) %>% 
  group_split()

我想创建一个新的数据名result.df,它将第一列显示为Site,然后每一列都是从2000年开始的调查年份(见下文)。在每一列下,将显示给定站点的调查年份与前一年调查之间的距离。例如:在2000年下,将显示每个地点1999年和2000年之间的距离;在2008年下,将显示该年与2006年之间的距离;等等。为了简化,我想提取对角线,如图中突出显示的:

为了使它更易于管理,我收集了一个长格式的数据,所以不是每年的一列,而是Year列和Year2列:

dist.df.list = dist.df %>% 
  rownames_to_column("YrSi") %>% 
  separate(YrSi, c("Year", "Site"), sep = " ") %>% 
  mutate (Year =  as.numeric (ifelse (Year == "99", sprintf("19%s", Year),sprintf("20%s", Year)))) %>% # Change to yyyy
  gather(key = "YrSi", value = "Dist", c(3:last_col())) %>% 
  separate(YrSi, c("Year2", "Site2"), sep = " ") %>% 
  mutate (Year2 =  as.numeric (ifelse (Year2 == "99", sprintf("19%s", Year2),sprintf("20%s", Year2)))) %>% # Change to yyyy
  arrange(Site, Year, Site2, Year2) %>%
  spread (key = "Year2", value = "Dist") %>% 
  group_by(Site, Site2) %>% 
  subset (Site == Site2) %>%
  relocate (Year, .after = Site2) %>% 
  gather (key="Year2", value = "dist", c(4:last_col())) %>%
  group_split()

现在生成result.df

# Initialize an empty data frame to store the results
result.df <- data.frame(Site = character(), stringsAsFactors = FALSE)

# Loop through each data frame in the list
for(i in 1:length(dist.df.list)) {
  # Extract the site name
  site <- dist.df.list[[i]]$Site[1]
  
  # Initialize a new row for the site in the result data frame
  new.row <- data.frame(Site = site, stringsAsFactors = FALSE)
  
  # Loop through each survey year and extract the distance between consecutive years
  for(j in c(2000, 2001, 2006,2008, 2011, 2013, 2016, 2019)) {
    col.name <- as.character(j)
    if(col.name %in% colnames(dist.df.list[[i]])) {
      # Extract the distance value from the test data frame
      dist <- dist.df.list[[i]] %>% 
        filter(Year2 == j) %>% 
        select(dist) %>% 
        pull()
      
      # If the distance value is missing, set it to NA
      if(is.na(dist)) {
        new.row[[col.name]] <- NA
      } else {
        # Otherwise, add the distance value to the new row
        new.row[[col.name]] <- dist
      }
    } else {
      # If the distance column doesn't exist, set the value to NA
      new.row[[col.name]] <- NA
    }
  }
  
  # Add the new row to the result data frame
  result.df <- rbind(result.df, new.row)
}
result.df

  Site 2000 2001 2006 2008 2011 2013 2016 2019
1    A   NA   NA   NA   NA   NA   NA   NA   NA
2    B   NA   NA   NA   NA   NA   NA   NA   NA
3    C   NA   NA   NA   NA   NA   NA   NA   NA
4    D   NA   NA   NA   NA   NA   NA   NA   NA
5    E   NA   NA   NA   NA   NA   NA   NA   NA
6    F   NA   NA   NA   NA   NA   NA   NA   NA
7    G   NA   NA   NA   NA   NA   NA   NA   NA
8    H   NA   NA   NA   NA   NA   NA   NA   NA

为什么我得到NA而不是距离?有没有更简单的方法来做到这一点?

s71maibg

s71maibg1#

看起来你是想把每年的值和下一年的值进行比较,棘手的是年份是非均匀分布的。处理这个问题的一种方法是将年份视为因子,然后将其转换为数字,因此2001年是year_index 3,2006年是year_index 4,等等。然后,我们可以将每行连接到具有相同站点和一个较低year_index的行。
这使得方法更短。
从第一个块中的初始df开始,然后再对其进行整形:

library(dplyr); library(tidyr)
df_long = df %>% 
  pivot_longer(-Site, names_to = "Year", values_to = "Value") %>%
  mutate(Year = as.factor(Year),
         Year_index = as.numeric(Year))
  
df_long %>%
  inner_join(df_long %>% mutate(Year_index = Year_index - 1), 
            by = join_by(Site, Year_index)) %>%
  transmute(Site, Year.x, diff = abs(Value.y - Value.x)) %>%
  pivot_wider(names_from = Year.x, values_from = diff)

结果(第一行与问题中突出显示的值匹配)

# A tibble: 8 × 9
  Site  `1999` `2000` `2001`  `2006` `2008`   `2011` `2013` `2016`
  <chr>  <dbl>  <dbl>  <dbl>   <dbl>  <dbl>    <dbl>  <dbl>  <dbl>
1 A     0.501  0.379  0.474  0.0574  0.895  0.483    0.364   0.341
2 B     0.500  0.503  0.224  0.105   0.470  0.797    0.654   0.204
3 C     0.627  0.0650 0.197  0.0523  0.354  0.339    0.0528  0.164
4 D     0.305  0.142  0.816  0.0607  0.212  0.105    0.771   0.453
5 E     0.542  0.102  0.0866 0.0888  0.272  0.000822 0.0449  0.216
6 F     0.0942 0.233  0.200  0.592   0.812  0.396    0.357   0.677
7 G     0.354  0.0790 0.626  0.142   0.521  0.291    0.570   0.289
8 H     0.540  0.366  0.362  0.00233 0.0180 0.355    0.315   0.125
bq3bfh9z

bq3bfh9z2#

base R中,我们可以通过从开始和结束处删除列来减去两个大小相等的数据集,从而轻松地完成此操作

cbind(df[1], abs(df[-1][-ncol(df[-1])] - df[-1][-1]))
  • 输出
Site       1999       2000       2001        2006       2008         2011       2013      2016
1    A 0.50072762 0.37932821 0.47404048 0.057449880 0.89491078 0.4825489887 0.36431356 0.3409840
2    B 0.50021861 0.50349919 0.22423648 0.104937233 0.46970872 0.7969002877 0.65373724 0.2040282
3    C 0.62658293 0.06496433 0.19673591 0.052296592 0.35376296 0.3385639775 0.05282467 0.1644644
4    D 0.30498228 0.14204609 0.81591059 0.060725187 0.21159377 0.1047621393 0.77085373 0.4531823
5    E 0.54205160 0.10177307 0.08655522 0.088825763 0.27174631 0.0008220095 0.04487888 0.2164007
6    F 0.09422804 0.23292835 0.19998981 0.591855075 0.81199655 0.3963689075 0.35672477 0.6770256
7    G 0.35441659 0.07899974 0.62577621 0.141737495 0.52058258 0.2906524187 0.57027453 0.2891290
8    H 0.54025639 0.36612370 0.36154801 0.002325156 0.01804719 0.3545106335 0.31464347 0.1252540

相关问题