如何在R中按组每隔一步删除某个变量的值,并创建直方图

njthzxwz  于 2023-04-27  发布在  其他
关注(0)|答案(1)|浏览(111)

我有数据集(例如部分)

# Sample of the asker's data set: 21 rows covering two districts
# (district_id 72 with 13 rows, district_id 58 with 8 rows).
# lon, lat and date are stored as character strings, not as numbers or Dates.
# Rows are listed in descending order of the ndvi column, and every ndvi value
# in this sample is above 0.8.
# NOTE(review): the data frame and its ndvi column share the name `ndvi`.
ndvi=structure(list(district_id = c(72L, 58L, 72L, 58L, 72L, 58L, 
58L, 72L, 72L, 72L, 72L, 72L, 72L, 58L, 72L, 58L, 72L, 58L, 72L, 
58L, 72L), gfid = c(73099L, 59055L, 73104L, 59067L, 73008L, 59006L, 
59111L, 72315L, 72263L, 73174L, 72315L, 72263L, 73104L, 59012L, 
73099L, 59058L, 73099L, 59060L, 73104L, 59127L, 72315L), lon = c("75.98381", 
"76.6595", "76.029083", "76.662102", "76.016747", "76.691063", 
"76.690277", "76.264481", "76.263268", "75.990143", "76.264481", 
"76.263268", "76.029083", "76.636169", "75.98381", "76.692299", 
"75.98381", "76.699669", "76.029083", "76.628418", "76.264481"
), lat = c("23.019079", "28.881411", "22.873091", "29.01403", 
"23.15546", "28.88661", "28.88784", "23.218969", "23.14661", 
"22.99415", "23.218969", "23.14661", "22.873091", "29.00535", 
"23.019079", "28.88608", "23.019079", "28.879801", "22.873091", 
"28.99098", "23.218969"), date = c("08.02.2021", "05.03.2021", 
"08.02.2021", "10.03.2021", "03.02.2021", "05.03.2021", "05.03.2021", 
"08.02.2021", "08.02.2021", "19.01.2021", "23.02.2021", "23.02.2021", 
"13.02.2021", "10.03.2021", "29.01.2021", "05.03.2021", "23.02.2021", 
"05.03.2021", "23.02.2021", "10.03.2021", "13.02.2021"), red = c(32L, 
40L, 44L, 58L, 32L, 70L, 66L, 76L, 68L, 76L, 76L, 76L, 76L, 84L, 
108L, 94L, 90L, 100L, 86L, 108L, 94L), green = c(184L, 216L, 
148L, 202L, 104L, 350L, 234L, 260L, 220L, 276L, 220L, 220L, 188L, 
196L, 340L, 310L, 226L, 436L, 166L, 316L, 246L), blue = c(1L, 
8L, 1L, 2L, 1L, 78L, 26L, 60L, 52L, 28L, 44L, 76L, 44L, 1L, 92L, 
70L, 82L, 100L, 46L, 12L, 38L), nir = c(4872L, 4008L, 4328L, 
4424L, 2332L, 4856L, 4216L, 4696L, 4200L, 4328L, 4248L, 4040L, 
3816L, 4072L, 5176L, 4456L, 4232L, 4680L, 4008L, 5000L, 4344L
), swir = c(1120L, 1184L, 1056L, 1376L, 592L, 1584L, 1200L, 1120L, 
1072L, 1136L, 928L, 992L, 1056L, 1440L, 1248L, 1296L, 1040L, 
1760L, 928L, 1696L, 1072L), B05 = c(352L, 416L, 288L, 544L, 272L, 
672L, 480L, 480L, 416L, 464L, 352L, 416L, 336L, 560L, 544L, 480L, 
416L, 928L, 288L, 672L, 416L), B06 = c(2912L, 2528L, 2592L, 3040L, 
1488L, 3296L, 2720L, 3104L, 2672L, 2976L, 2464L, 2352L, 2272L, 
3088L, 3296L, 2912L, 2416L, 3168L, 2208L, 3568L, 2720L), B07 = c(4640L, 
3872L, 4192L, 4448L, 2128L, 4768L, 4000L, 4640L, 3984L, 4336L, 
4064L, 3872L, 3808L, 4448L, 4768L, 4256L, 3920L, 3936L, 3808L, 
4960L, 4384L), B08A = c(4752L, 3936L, 4384L, 4576L, 2192L, 4832L, 
4064L, 4704L, 4128L, 4400L, 4320L, 4064L, 4000L, 4448L, 4960L, 
4256L, 4192L, 4176L, 4064L, 5024L, 4512L), B12 = c(464L, 544L, 
416L, 608L, 240L, 736L, 544L, 496L, 480L, 464L, 416L, 480L, 480L, 
672L, 528L, 608L, 416L, 928L, 416L, 800L, 480L), ndvi = c(0.986949429, 
0.980237154, 0.979871912, 0.974118697, 0.972927241, 0.971579374, 
0.969173283, 0.968147527, 0.968134957, 0.965485921, 0.964847363, 
0.963070942, 0.960945529, 0.959576515, 0.959121877, 0.958681318, 
0.958352614, 0.958158995, 0.957987298, 0.95771339, 0.957638575
)), class = "data.frame", row.names = c(NA, -21L))

对于ndvi的任何一个值,如果它大于0.8,如何把它后面一步的值删除?我来澄清一下这个问题。我们以district_id = 72为例:第一行ndvi > 0.8(= 0.98),那么下一个值必须删除,即我们删除0.97987191。再下一个值(第3个)我们不碰,但第4个值也要删除,也就是每隔1步删除一次。第1、3、5个值保留,第2、4个值依次删除,直到每个地区(分组变量)各自结束。期望的输出如下:

district_id  gfid       lon       lat       date red green blue  nir swir B05  B06  B07 B08A B12
1          72 73099  75.98381 23.019079 08.02.2021  32   184    1 4872 1120 352 2912 4640 4752 464
2          72 73104 76.029083 22.873091 08.02.2021  44   148    1 4328 1056 288 2592 4192 4384 416
3          72 73008 76.016747  23.15546 03.02.2021  32   104    1 2332  592 272 1488 2128 2192 240
4          72 72315 76.264481 23.218969 08.02.2021  76   260   60 4696 1120 480 3104 4640 4704 496
5          72 72263 76.263268  23.14661 08.02.2021  68   220   52 4200 1072 416 2672 3984 4128 480
6          72 73174 75.990143  22.99415 19.01.2021  76   276   28 4328 1136 464 2976 4336 4400 464
       ndvi
1 0.9869494
2        NA
3 0.9729272
4        NA
5 0.9681350
6        NA

在得到上述输出之后,我们需要用相邻两个点的平均值来填补产生的空缺:将第1和第3个点的平均值填入第2个点,将第3和第5个点的平均值填入第4个点,以此类推。
最后,我需要每个district_id在我们删除值之前和恢复删除值之后的NDVI直方图。
对每个地区分别每隔1步删除ndvi值、再用平均值把它们恢复,并得到每个地区删除前和恢复后的直方图,最简单的方法是什么?
谢谢你的帮助。

s6fujrry

s6fujrry1#

我认为这是“归约”(reduce,例如base::Reduce或purrr::reduce)与“合并”(coalesce,或滑动窗口平均)的组合。我说“归约”是因为你所有的数据都超过了0.8,但你并不希望同一个district_id中相邻的值都被转换为NA,所以在决定是否把某个值置为NA之前,需要先看它的前一个值是否已经被置为NA。
我将使用dplyr进行演示,因为它使分组和操作更容易可视化。这可以转换为base R(或同样容易的data.table)。
首先给出完整的解决方案,它会直接替换ndvi列:

library(dplyr)

# Blank out (set to NA) every value that directly follows a kept value above
# `threshold`. The decision depends on the already-processed previous value,
# not on the raw one, which is why a running Reduce() is used: a value that
# follows a blanked (NA) entry is always kept, and so is the first value.
blank_after_high <- function(x, threshold = 0.8) {
  Reduce(
    function(prev, this) {
      # Scalar test, so `||` rather than the elementwise `|`; is.na() comes
      # first so an NA `prev` short-circuits before the comparison.
      # `this[NA]` yields an NA of the same type as `this`.
      if (is.na(prev) || prev <= threshold) this else this[NA]
    },
    x,
    accumulate = TRUE
  )
}

ndvi %>%
  group_by(district_id) %>%
  mutate(
    ndvi = blank_after_high(ndvi),
    # Fill each gap with the mean of its two neighbours. A gap in the last row
    # of a group has no following value, so it stays NA.
    ndvi = coalesce(ndvi, (lag(ndvi) + lead(ndvi)) / 2)
  ) %>%
  # Drop the grouping so later steps do not silently run per district.
  ungroup()

每个步骤的演练:

library(dplyr)

# Step 1 only: build `ndvi2` next to the untouched `ndvi` column so the
# blanking can be checked row by row.
# Keep `this` when the previous processed value was blanked (NA) or is not
# above 0.8; otherwise return an NA of the same type. The test is scalar, so
# it uses `||` instead of the elementwise `|`.
drop_after_high <- function(prev, this) {
  if (is.na(prev) || prev <= 0.8) this else this[NA]
}

# The result is left grouped on purpose: it is only printed for inspection.
ndvi %>%
  group_by(district_id) %>%
  mutate(ndvi2 = Reduce(drop_after_high, ndvi, accumulate = TRUE)) %>%
  filter(district_id == 72)
# # A tibble: 13 × 17
# # Groups:   district_id [1]
#    district_id  gfid lon       lat    date    red green  blue   nir  swir   B05   B06   B07  B08A   B12  ndvi  ndvi2
#          <int> <int> <chr>     <chr>  <chr> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <dbl>  <dbl>
#  1          72 73099 75.98381  23.01… 08.0…    32   184     1  4872  1120   352  2912  4640  4752   464 0.987  0.987
#  2          72 73104 76.029083 22.87… 08.0…    44   148     1  4328  1056   288  2592  4192  4384   416 0.980 NA    
#  3          72 73008 76.016747 23.15… 03.0…    32   104     1  2332   592   272  1488  2128  2192   240 0.973  0.973
#  4          72 72315 76.264481 23.21… 08.0…    76   260    60  4696  1120   480  3104  4640  4704   496 0.968 NA    
#  5          72 72263 76.263268 23.14… 08.0…    68   220    52  4200  1072   416  2672  3984  4128   480 0.968  0.968
#  6          72 73174 75.990143 22.99… 19.0…    76   276    28  4328  1136   464  2976  4336  4400   464 0.965 NA    
#  7          72 72315 76.264481 23.21… 23.0…    76   220    44  4248   928   352  2464  4064  4320   416 0.965  0.965
#  8          72 72263 76.263268 23.14… 23.0…    76   220    76  4040   992   416  2352  3872  4064   480 0.963 NA    
#  9          72 73104 76.029083 22.87… 13.0…    76   188    44  3816  1056   336  2272  3808  4000   480 0.961  0.961
# 10          72 73099 75.98381  23.01… 29.0…   108   340    92  5176  1248   544  3296  4768  4960   528 0.959 NA    
# 11          72 73099 75.98381  23.01… 23.0…    90   226    82  4232  1040   416  2416  3920  4192   416 0.958  0.958
# 12          72 73104 76.029083 22.87… 23.0…    86   166    46  4008   928   288  2208  3808  4064   416 0.958 NA    
# 13          72 72315 76.264481 23.21… 13.0…    94   246    38  4344  1072   416  2720  4384  4512   480 0.958  0.958

这验证了我们在ndvi2中每隔一个值为空(因为所有值都超过0.8)。
对于求平均值,只要能确定数据中不存在其他NA值,我们就可以用简单的(lag() + lead())/2来求平均值(如果确实存在其他NA值,可以改用zoo::rollmean或类似的真正的滑动窗口平均,并配合if_else使用)。

# Steps 1 and 2 side by side: `ndvi2` holds the blanked series, `ndvi3` the
# same series with the gaps filled again.
# The result is left grouped on purpose: it is only printed for inspection.
ndvi %>%
  group_by(district_id) %>%
  mutate(
    # Scalar test inside the reducer, so `||` rather than the elementwise `|`.
    ndvi2 = Reduce(
      function(prev, this) if (is.na(prev) || prev <= 0.8) this else this[NA],
      ndvi,
      accumulate = TRUE
    ),
    # Mean of the two neighbours; a gap in a group's last row stays NA because
    # it has no following value.
    ndvi3 = coalesce(ndvi2, (lag(ndvi2) + lead(ndvi2)) / 2)
  ) %>%
  filter(district_id == 72)
# # A tibble: 13 × 18
# # Groups:   district_id [1]
#    district…¹  gfid lon   lat   date    red green  blue   nir  swir   B05   B06   B07  B08A   B12  ndvi  ndvi2 ndvi3
#         <int> <int> <chr> <chr> <chr> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <dbl>  <dbl> <dbl>
#  1         72 73099 75.9… 23.0… 08.0…    32   184     1  4872  1120   352  2912  4640  4752   464 0.987  0.987 0.987
#  2         72 73104 76.0… 22.8… 08.0…    44   148     1  4328  1056   288  2592  4192  4384   416 0.980 NA     0.980
#  3         72 73008 76.0… 23.1… 03.0…    32   104     1  2332   592   272  1488  2128  2192   240 0.973  0.973 0.973
#  4         72 72315 76.2… 23.2… 08.0…    76   260    60  4696  1120   480  3104  4640  4704   496 0.968 NA     0.971
#  5         72 72263 76.2… 23.1… 08.0…    68   220    52  4200  1072   416  2672  3984  4128   480 0.968  0.968 0.968
#  6         72 73174 75.9… 22.9… 19.0…    76   276    28  4328  1136   464  2976  4336  4400   464 0.965 NA     0.966
#  7         72 72315 76.2… 23.2… 23.0…    76   220    44  4248   928   352  2464  4064  4320   416 0.965  0.965 0.965
#  8         72 72263 76.2… 23.1… 23.0…    76   220    76  4040   992   416  2352  3872  4064   480 0.963 NA     0.963
#  9         72 73104 76.0… 22.8… 13.0…    76   188    44  3816  1056   336  2272  3808  4000   480 0.961  0.961 0.961
# 10         72 73099 75.9… 23.0… 29.0…   108   340    92  5176  1248   544  3296  4768  4960   528 0.959 NA     0.960
# 11         72 73099 75.9… 23.0… 23.0…    90   226    82  4232  1040   416  2416  3920  4192   416 0.958  0.958 0.958
# 12         72 73104 76.0… 22.8… 23.0…    86   166    46  4008   928   288  2208  3808  4064   416 0.958 NA     0.958
# 13         72 72315 76.2… 23.2… 13.0…    94   246    38  4344  1072   416  2720  4384  4512   480 0.958  0.958 0.958
# # … with abbreviated variable name ¹​district_id

相关问题