SQL Server 缓变维度数据的重复数据删除

hec6srdp  于 2022-12-03  发布在  其他
关注(0)|答案(1)|浏览(112)

通过下面的代码我试图描述我的问题。基本上我想从缓慢变化的维度数据:

到某些已消除重复数据的数据:

显然我的实际数据更大(更多的ID和值)。在理想的情况下,我还希望尽可能避免使用#Dates帮助器表。我目前的尝试是:

SELECT
    ID
    , Value1
    , Value2
    , MIN(#Dates.TheDate) AS StartDate
    , MAX(#Dates.TheDate) AS EndDate
FROM #Dates
INNER JOIN #Haves ON #Dates.TheDate BETWEEN #Haves.StartDate AND #Haves.EndDate
GROUP BY
    ID
    , Value1
    , Value2

并不产生需求。
复制代码:

IF OBJECT_ID(N'tempdb..#Dates') IS NOT NULL DROP TABLE #Dates;
IF OBJECT_ID(N'tempdb..#Haves') IS NOT NULL DROP TABLE #Haves;
IF OBJECT_ID(N'tempdb..#Wants') IS NOT NULL DROP TABLE #Wants;

DECLARE @FromDate DATETIME, @ToDate DATETIME;
SET @FromDate = '2020-01-01';
SET @ToDate = '2020-01-31';

-- all days in that period
SELECT TOP (DATEDIFF(DAY, @FromDate, @ToDate)+1)
  TheDate = DATEADD(DAY, number, @FromDate)
INTO #Dates
FROM [master].dbo.spt_values
WHERE [type] = N'P' ORDER BY number;

SELECT * FROM #Dates

SELECT
    *
INTO #Haves
FROM (SELECT 1 ID, '2020-01-01' AS StartDate, '2020-01-03' AS EndDate, 1 Value1, 1 Value2
      UNION
      SELECT 1 ID, '2020-01-03' AS StartDate, '2020-01-05' AS EndDate, 1 Value1, 1 Value2
      UNION
      SELECT 1 ID, '2020-01-05' AS StartDate, '2020-01-07' AS EndDate, 3 Value1, 1 Value2
      UNION
      SELECT 1 ID, '2020-01-07' AS StartDate, '2999-01-01' AS EndDate, 1 Value1, 1 Value2
) AS IQ1;

SELECT * from #Haves

SELECT
    ID
    , Value1
    , Value2
    , MIN(#Dates.TheDate) AS StartDate
    , MAX(#Dates.TheDate) AS EndDate
FROM #Dates
INNER JOIN #Haves ON #Dates.TheDate BETWEEN #Haves.StartDate AND #Haves.EndDate
GROUP BY
    ID
    , Value1
    , Value2

SELECT
    *
INTO #Wants
FROM (SELECT 1 ID, '2020-01-01' AS StartDate, '2020-01-05' AS EndDate, 1 Value1, 1 Value2
      UNION
      SELECT 1 ID, '2020-01-05' AS StartDate, '2020-01-07' AS EndDate, 3 Value1, 1 Value2
      UNION
      SELECT 1 ID, '2020-01-07' AS StartDate, '2999-01-01' AS EndDate, 1 Value1, 1 Value2
) AS IQ1;

SELECT * FROM #Wants
zujrkrfu

zujrkrfu1#

这就是所谓的间隙与孤立问题。您想要侦测连续数据列的群组。屏幕撷取画面中的数据列#1和#2会被视为一个群组,因为数据列#1的结束日期相等数据列#2的开始日期,且id、value1和value2相等。
因此,让我们首先检测所有组的更改。然后将更改计数到一行以获得组编号。然后聚合以获得每个组的开始和结束日期。

SELECT
  id,
  MIN(startdate) AS startdate,
  MAX(enddate) AS enddate,
  MIN(value1) AS value1,
  MIN(value2) AS value2
FROM
(
  SELECT
    id, startdate, enddate, value1, value2,
    SUM(chg) OVER (PARTITION BY id ORDER BY startdate) AS grp
  FROM
  (
    SELECT
      id, startdate, enddate, value1, value2,
      CASE WHEN startdate = LAG(enddate) OVER (PARTITION BY id ORDER BY startdate)
            AND value1    = LAG(value1)  OVER (PARTITION BY id ORDER BY startdate)
            AND value2    = LAG(value2)  OVER (PARTITION BY id ORDER BY startdate)
        THEN 0
        ELSE 1
      END AS chg
    FROM #Haves
  ) with_change_flags
) with_groups
GROUP BY id, grp
ORDER BY id, grp;

相关问题