sql - 重複するバージョン番号付きの間隔のセットで、各時点で最新のバージョンを見つけます

Question

私は一連の日付間隔で作業しています。各間隔にはバージョン番号があり、新しい間隔は古い間隔と頻繁に重複するか、それらのサブセットになります。このデータから、各時点での最新のバージョン番号を示す新しい間隔のセットを計算する必要があります。この問題に対するセットベースの解決策はありますか？

これがイラストです：

Interval 1: 11111111111111111111111      
Interval 2:     2222222222               
Interval 3:   33333333333333             
Interval 4:                     444444444
Interval 5:                   555555555  
Result    : 11333333333333331155555555544

これが私が扱っているデータのサンプルです：

groupId   startDate  endDate     version
--------  ---------  ----------  ------
1         1/1/2010   1/1/2011    1
1         10/1/2010  7/5/2011    2
1         7/5/2011   8/13/2012   3
1         8/13/2012  12/31/2012  6
1         10/1/2012  11/1/2012   8

...および目的の出力：

groupId   startDate  endDate     version
--------  ---------  ----------  ------
1         1/1/2010   10/1/2010   1
1         10/1/2010  7/5/2011    2
1         7/5/2011   8/13/2012   3
1         8/13/2012  10/1/2012   6
1         10/1/2012  11/1/2012   8 << note how version 8 supersedes version 6
1         11/1/2012  12/31/2012  6 << version 6 is split into two records

この問題の他の例は見つかりませんでした。グーグルで検索すると、ギャップや島、またはカバーセットを識別するクエリのみが表示されます。

反復的な解決策があると思います（SQL Server2008）。これは、結果セットの間隔の一時テーブルから始まり、特別なバージョン番号のレコードを挿入することによって、カバーする範囲の開始点と終了点を定義します。次に、結果セットの間隔間のギャップを繰り返し識別し、ギャップがなくなるか、追加するレコードがなくなるまで、元のデータセットの最新のレコードでそれらを埋めようとします。

GO
-- Create the sample data set and the results table, then seed the results
-- table with placeholder rows that mark the outer edges of the range to cover.
CREATE TABLE #Data (
     groupId    INT
    ,startDate  DATE
    ,endDate    DATE
    ,versionId  INT
)

-- Sample intervals: group 1 contains nested/superseding versions,
-- group 2 mirrors the example from the question text.
INSERT INTO #Data (groupId, startDate, endDate, versionId)
VALUES
     (1, '2007-12-22', '2008-12-22', 8)
    ,(1, '2008-12-22', '2009-12-22', 9)
    ,(1, '2009-12-22', '2010-12-22', 10)
    ,(1, '2010-12-22', '2011-12-22', 11)
    ,(1, '2011-01-01', '2011-11-30', 500)
    ,(1, '2011-12-22', '2012-12-22', 12)
    ,(1, '2012-01-22', '2012-12-22', 13)
    ,(1, '2012-01-22', '2012-12-22', 14)
    ,(1, '2012-04-22', '2012-12-22', 17)
    ,(1, '2012-04-22', '2012-12-22', 19)
    ,(2, '2010-01-01', '2011-01-01', 1)
    ,(2, '2010-10-01', '2011-07-05', 2)
    ,(2, '2011-07-05', '2012-08-13', 3)
    ,(2, '2012-08-13', '2012-12-31', 6)
    ,(2, '2012-10-01', '2012-11-01', 8)


-- groupId is INT to match #Data.groupId: this avoids implicit conversions in
-- the joins and keeps ORDER BY groupId numeric rather than lexicographic.
-- versionId is BIGINT because the placeholder id does not fit in an INT.
CREATE TABLE #Results (
     groupId    INT
    ,startDate  DATE
    ,endDate    DATE
    ,versionId  BIGINT
)

DECLARE @startDate      DATE
DECLARE @endDate        DATE
DECLARE @placeholderId  BIGINT

-- Outer bounds of the range to cover and the sentinel version id that marks
-- placeholder rows (filtered out of the final output).
SET @startDate = '20030101'
SET @endDate = '20121231'
SET @placeholderId = 999999999999999

-- One leading and one trailing placeholder row per group. Each row spans
-- from the smaller to the larger of (range bound, group's extreme date).
INSERT #Results (groupId, startDate, endDate, versionId)
SELECT
     groupId
    ,CASE WHEN MIN(startDate) < @startDate THEN MIN(startDate) ELSE @startDate END
    ,CASE WHEN MIN(startDate) < @startDate THEN @startDate ELSE MIN(startDate) END
    ,@placeholderId
FROM #Data
GROUP BY groupId
UNION ALL
SELECT
     groupId
    ,CASE WHEN MAX(endDate) < @endDate THEN MAX(endDate) ELSE @endDate END
    ,CASE WHEN MAX(endDate) < @endDate THEN @endDate ELSE MAX(endDate) END
    ,@placeholderId
FROM #Data
GROUP BY groupId
GO

-- Fill gaps in results table.
-- Each pass finds the uncovered gaps between rows already in #Results and
-- inserts, for every gap, the highest-versioned #Data row that overlaps it
-- (clipped to the gap). Passes repeat until nothing is inserted or the
-- iteration cap is reached.
DECLARE @startDate      DATE
DECLARE @endDate        DATE
DECLARE @placeholderId  BIGINT

-- These must match the values used when the placeholder rows were created;
-- a smaller @endDate here would make a leading placeholder that ends on or
-- after it look like a trailing one, and that group would never be filled.
SET @startDate = '20030101'
SET @endDate = '20121231'
SET @placeholderId = 999999999999999

DECLARE @counter INT
DECLARE @maxIterations INT
SET @counter = 0
SET @maxIterations = 10  -- safety cap on the number of passes

WHILE @counter < @maxIterations
BEGIN
    SET @counter = @counter + 1;
    WITH Gaps AS (
        -- A gap runs from a "gap start" (an endDate in #Results that no other
        -- row of the group covers) to the nearest following "gap end"
        -- (a startDate in #Results that no other row of the group covers).
        SELECT
             gs.groupId
            ,gs.startDate
            ,MIN(ge.endDate) as endDate
            ,ROW_NUMBER() OVER (ORDER BY gs.groupId, gs.startDate) as gapId
        FROM (
            -- Candidate gap starts; the trailing placeholder is excluded.
            SELECT groupId, endDate as startDate
            FROM #Results r1 
            WHERE NOT EXISTS (
                    SELECT * 
                    FROM #Results r2 
                    WHERE r2.groupId = r1.groupId
                        AND r2.versionId <> r1.versionId
                        AND r2.startDate <= r1.endDate
                        AND r2.endDate > r1.endDate
                )
                AND NOT (endDate >= @endDate AND versionId = @placeholderId)
        ) gs
        INNER JOIN (
            -- Candidate gap ends; the leading placeholder is excluded.
            SELECT groupId, startDate as endDate
            FROM #Results r1 
            WHERE NOT EXISTS (
                    SELECT * 
                    FROM #Results r2 
                    WHERE r2.groupId = r1.groupId
                        AND r2.versionId <> r1.versionId
                        AND r2.endDate >= r1.startDate
                        AND r2.startDate < r1.startDate
                )
                AND NOT (startDate <= @startDate AND versionId = @placeholderId)
        ) ge
            ON ge.groupId = gs.groupId
            AND ge.endDate >= gs.startDate
        GROUP BY gs.groupId, gs.startDate
    )
    INSERT #Results (
         groupId
        ,startDate
        ,endDate
        ,versionId
    )
    SELECT
         d.groupId
        -- Clip the data row to the gap it fills.
        ,CASE WHEN d.startDate < g.startDate THEN g.startDate ELSE d.startDate END
        ,CASE WHEN d.endDate > g.endDate THEN g.endDate ELSE d.endDate END
        ,d.versionId
    FROM #Data d
    INNER JOIN Gaps g
        ON g.groupId = d.groupId
        AND g.startDate <= d.endDate
        AND g.endDate >= d.startDate
    INNER JOIN (
        -- For each gap, the highest version among overlapping #Data rows that
        -- is lower than the versions bordering the gap and is not completely
        -- covered by a higher-versioned #Data row.
        SELECT 
             d.groupId
            ,gapId
            ,MAX(d.versionId) as versionId
        FROM #Data d
        INNER JOIN Gaps g
            ON g.groupId = d.groupId
            AND g.startDate <= d.endDate
            AND g.endDate >= d.startDate
        WHERE d.versionId < (
                SELECT MIN(versionId)
                FROM #Results r
                WHERE r.groupId = d.groupId
                    AND (r.startDate = g.endDate OR r.endDate = g.startDate)
            )
            AND NOT EXISTS (
                SELECT *
                FROM #Data dsup
                WHERE dsup.groupId = d.groupId
                    AND dsup.versionId > d.versionId
                    AND dsup.startDate <= d.startDate
                    AND dsup.endDate >= d.endDate
            )
        GROUP BY
             d.groupId
            ,g.gapId
    ) mg
        ON mg.groupId = g.groupId
        AND mg.gapId = g.gapId
        AND mg.versionId = d.versionId;

    -- Nothing inserted means #Results is unchanged, so every further pass
    -- would compute the same gaps and insert nothing: stop early.
    IF @@ROWCOUNT = 0 BREAK;
END

-- Final output: everything except the placeholder rows.
SELECT
     groupId
    ,startDate
    ,endDate
    ,versionId
FROM #Results
WHERE versionId <> @placeholderId
order by groupId, startDate

セットベースのソリューションの方がはるかに便利ですが、私はそれを見つけるのに苦労しました。何か案は？

score 4 · Accepted Answer

-- create a dates table (one row per calendar day)
create table dates (thedate date primary key clustered);
;with dates(thedate) as (
  -- years.number is an offset in years from date 0 (1900-01-01);
  -- days.number enumerates the days within that year
  select dateadd(yy,years.number,0)+days.number
    from master..spt_values years
    join master..spt_values days
      on days.type='p' and days.number < datepart(dy,dateadd(yy,years.number+1,0)-1)
   where years.type='p' and years.number between 100 and 150
      -- note: 100-150 creates dates in the year range 2000-2050
      --       adjust as required
)
-- inside this statement the unqualified name "dates" is the CTE,
-- while dbo.dates is the table created above
insert dbo.dates (thedate) select thedate from dates;

-- for each date, determine the prevailing version
-- (both interval ends are treated as inclusive)
  select t.groupId, d.thedate, max(t.versionId) versionId
    into #tmp1
    from dates d
    join #Data t on t.startDate <= d.thedate and d.thedate <= t.endDate
group by t.groupId, d.thedate;

-- create index to help
create clustered index cix_tmp1 on #tmp1(groupId, thedate, versionId);

-- collapse runs of consecutive days that share a version into intervals.
-- Within one (groupId, versionId), consecutive dates minus their row number
-- give the same runKey, so each run forms its own group. Taking min/max per
-- run gives the final run of each group a real end date (instead of NULL)
-- and keeps days not covered by any interval out of the preceding run.
;with runs as (
   select a.groupId, a.thedate, a.versionId,
          runKey = dateadd(d,
                           -cast(row_number() over (partition by a.groupId, a.versionId
                                                    order by a.thedate) as int),
                           a.thedate)
     from #tmp1 a
)
   select r.groupId, min(r.thedate) startdate, max(r.thedate) enddate, r.versionId
     from runs r
 group by r.groupId, r.versionId, r.runKey
 order by groupId, startdate;

もちろん、すべてを「1つのクエリ」で実行できますが、パフォーマンスが低下するため、危険を冒して実行してください。

使用しないでください-学術的関心のみ-

-- Single-statement variant: calendar, per-day prevailing version and run
-- detection are all CTEs, so nothing is materialized or indexed.
;with dates(thedate) as (
  -- years.number is an offset in years from date 0 (1900-01-01);
  -- days.number enumerates the days within that year
  select dateadd(yy,years.number,0)+days.number
    from master..spt_values years
    join master..spt_values days
      on days.type='p' and days.number < datepart(dy,dateadd(yy,years.number+1,0)-1)
   where years.type='p' and years.number between 100 and 150
      -- note: 100-150 creates dates in the year range 2000-2050
      --       adjust as required
), tmp1 as (
  -- for each date, the prevailing (highest) version; both ends inclusive
  select t.groupId, d.thedate, max(t.versionId) versionId
    from dates d
    join #Data t on t.startDate <= d.thedate and d.thedate <= t.endDate
group by t.groupId, d.thedate
), runs as (
  -- consecutive dates of one (groupId, versionId) minus their row number
  -- share the same runKey, so each unbroken run forms its own group
   select a.groupId, a.thedate, a.versionId,
          runKey = dateadd(d,
                           -cast(row_number() over (partition by a.groupId, a.versionId
                                                    order by a.thedate) as int),
                           a.thedate)
     from tmp1 a
)
-- min/max per run: the last run of each group gets a real end date (not NULL)
-- and days covered by no interval are not folded into the preceding run
   select r.groupId, min(r.thedate) startdate, max(r.thedate) enddate, r.versionId
     from runs r
 group by r.groupId, r.versionId, r.runKey
 order by groupId, startdate;

score 1 · Accepted Answer

コメントからのフィードバックにより更新されました。他の回答で簡単に解決できることが証明されているため、少数の人が指摘したエンドケースについて心配するつもりはありませんが、先に進んで、DDLを必要としない実用的なバージョンを入手したいと思いました。 ..オプションがあるのは良いことだと思います。:-)

このコードは機能するはずです：

-- Splits each group's timeline at every interval start and at the day after
-- every interval end, then reports the highest version covering the last day
-- of each resulting segment. The final segment of a group is closed with the
-- hard-coded date '2012-12-31'.
;with boundaries as (
    -- distinct segment start candidates per group
    select groupID, startDate from #Data
    union
    select groupID, DATEADD(DAY, 1, endDate) as startDate from #Data
), numbered as (
    select groupId, startDate,
           ROW_NUMBER() over (partition by groupID order by startDate) as rownumber
    from boundaries
), segments as (
    -- a segment ends the day before the next boundary of the same group
    select cur.groupId, cur.startDate,
           coalesce(DATEADD(DAY, -1, nxt.startDate), ('2012-12-31')) as segment_end_date
    from numbered cur
    left outer join numbered nxt
        on nxt.groupId = cur.groupId
       and nxt.rownumber = cur.rownumber + 1
), valid_segments as (
    -- drop segments that would end before they start
    select groupId, startDate, segment_end_date
    from segments
    where startDate <= segment_end_date
)
select seg.groupId, seg.startDate, seg.segment_end_date, Max(ver.versionId)
from valid_segments seg
left outer join #Data ver
    on ver.groupId = seg.groupId
   and seg.segment_end_date between ver.startDate and ver.endDate
group by seg.groupId, seg.startDate, seg.segment_end_date
order by seg.groupId, seg.startDate

それを単一のSQLステートメントにするために私がしなければならなかったいくつかの小さな警告があります。まず、最大終了日は動的ではありません。「2012-12-31」をハードコーディングしました。MAX（endDate）に置き換えることはできますが、GROUPBYステートメントに入れることはできません。手順でこれを行うことができる場合は、次のことができます。

-- T-SQL assigns to a variable with SELECT @var = ...; SELECT INTO creates a table
declare @max_end_date date;
select @max_end_date = MAX(endDate) from #Data;

'2012-12-31'を@max_end_dateに置き換えます。

次に、隣接する2つのセグメントの値が同じにならないことを保証しません。これはあなたにとって重要かもしれないし、そうでないかもしれません...つまり、あなたが以下を持っていた場合：

Interval 1:       111111      
Interval 2:   22222222222222

出力は次のようになります。

Interval 1:   2222
Interval 2:       2222222222

それでも、単純で効率的なSQLクエリでそれをヒットする価値があると思います。これらの警告を修正するのは難しいことではないかもしれませんが、私が何に取り組んでいるかは問題ではなかったので、私はまだ気にしませんでした。

score 0 · Accepted Answer

終了日だけでなくギャップも重要な場合は、次の方法で対応できます。このソリューションは、間隔が単なる日付ではなく datetime で表されている場合にも機能するように適合させることができます。

最初にたくさんの関数

特定の日付のバージョンを取得するための1つ

-- Returns the highest VersionID among the intervals of @GroupID that contain
-- @Date, or NULL when no interval contains it.
Create Function dbo.VersionAtDate(@GroupID int, @Date datetime) Returns int as
Begin
  Return (
    Select
      Max(iv.VersionID)
    From
      VersionedIntervals iv
    Where
      iv.GroupID = @GroupID And
      @Date >= iv.StartDate And
      @Date < iv.EndDate + 1 -- EndDate is inclusive; with half-open intervals this would be @Date < iv.EndDate
  )
End

次に、2つの日時の中間点（分の解像度）を取得します。

-- Returns the point halfway between @Start and @End, computed in whole
-- minutes (the minute difference is halved with integer division).
Create Function dbo.Midpoint(@Start datetime, @End datetime) Returns datetime as
Begin
  Declare @HalfSpanMinutes int
  Set @HalfSpanMinutes = DateDiff(Minute, @Start, @End) / 2
  Return DateAdd(Minute, @HalfSpanMinutes, @Start)
End

中間点でのバージョン：

-- Returns the version of @GroupID in effect halfway between @Start and @End
-- (dbo.VersionAtDate evaluated at dbo.Midpoint of the two).
Create Function dbo.VersionAtMidpoint(@GroupID int, @Start datetime, @End datetime) returns int as
Begin
  Declare @Mid datetime
  Set @Mid = dbo.Midpoint(@Start, @End)
  Return dbo.VersionAtDate(@GroupID, @Mid)
End;

最後に、いくつかのポイントが1つの範囲の開始と別の範囲の終了であるという事実を支援するテーブル値関数であり、このために1つの入力から2つの行を取得するのに役立ちます。

-- returns two rows if a point is the end of one interval and the
-- start of another
--
-- Given the current run (@Start..@End at version @Version) and the next
-- boundary date @Next, emits the row(s) describing how the run continues:
--   * same version at @Next        -> one row extending the run to @Next
--   * version over (@End, @Next)
--     differs from @Version        -> one row for a new run over @End..@Next
--   * otherwise                    -> a closing row for the current run
--                                     (RN = -1) plus a zero-length row that
--                                     starts the next version at @Next
-- @RN is the row number of the @Next boundary and is stamped on the row that
-- the caller should continue from.
-- NOTE(review): when dbo.VersionAtDate returns NULL the = and != tests below
-- are not true under default ANSI_NULLS, so control falls to the Else
-- branches; confirm that this is the intended handling of uncovered dates.
Create Function dbo.EndPoints(@GroupID int, @RN bigint, @Start datetime, @End datetime, @Next datetime, @Version int)
Returns @EndPoints Table (
    GroupID int,
    RN bigint,
    Version int,
    StartDate datetime,
    EndDate datetime
) As
Begin
  Declare @NextVersion int, @VersionAtMidpoint int
  -- version in effect exactly at the next boundary
  Set @NextVersion = dbo.VersionAtDate(@GroupID, @Next)
  If @NextVersion = @Version
    -- interval carries on
    Insert Into @EndPoints Select @GroupID, @RN, @Version, @Start, @Next
  Else
  Begin
    -- interval has ended
    -- sample the version strictly between the current end and the next boundary
    Set @VersionAtMidpoint = dbo.VersionAtMidPoint(@GroupID, @End, @Next)
    If @VersionAtMidpoint != @Version
        -- we have something like this, start a run of 3s (run of 4s is already ended by previous call)
        -- 3333333
        -- 44     
        Insert Into @EndPoints Select @GroupID, @RN, @VersionAtMidpoint, @End, @Next 
    Else
    Begin
        -- We have something like this, end the run of 3s and start the run of fours
        -- 33333
        --   444
        -- RN = -1 marks the closing row of the finished run
        Insert Into @EndPoints Select @GroupID, -1, @Version, @Start, @Next
        -- zero-length seed row for the new run, carrying the real row number
        Insert Into @EndPoints Select @GroupID, @RN, @NextVersion, @Next, @Next
    End
  End
  Return
End

これらの仕組みがすべて整ったら、最後に再帰CTEとテーブル変数を組み合わせて使用します。maxrecursionを適切に設定する必要があります。

-- Every distinct start/end date per group, numbered in date order.
Declare @Bounds Table (GroupID int, RN bigint, BoundDate datetime, Primary Key (GroupID, RN))

Insert Into
    @Bounds (GroupID, RN, BoundDate)
Select
    GroupID,
    Row_Number() Over (Partition By GroupID Order By BoundDate),
    BoundDate
From (
    Select
        GroupID,
        StartDate As BoundDate
    From
        dbo.VersionedIntervals
    -- Union (not Union All): a date that is both a start and an end must
    -- appear only once as a boundary
    Union
    Select
        GroupID,
        EndDate
    From
        dbo.VersionedIntervals
    ) a

-- Walk the boundaries of each group in order, carrying the current run
-- (StartDate, EndDate, Version) forward; dbo.EndPoints decides at each
-- boundary whether the run continues or a new one starts.
;With VersionedBounds (GroupID, RN, StartDate, EndDate, Version) as (
    -- anchor: a zero-length run at the first boundary of each group
    Select
        GroupID,
        RN,
        BoundDate,
        BoundDate,
        dbo.VersionAtDate(GroupID, BoundDate)
    From
        @Bounds
    Where
        RN = 1
    Union All
    -- step: advance to the next boundary (rows emitted with RN = -1 never
    -- match b.RN = v.RN + 1, so finished runs are not extended)
    Select
        e.GroupID,
        e.RN,
        e.StartDate,
        e.EndDate,
        e.Version
    From
        @Bounds b
            Inner Join
        VersionedBounds v
            On v.GroupID = b.GroupID And b.RN = v.RN + 1
            Cross Apply 
        dbo.EndPoints(v.GroupID, b.RN, v.StartDate, v.EndDate, b.BoundDate, v.Version) e
)
-- one output row per run: the furthest EndDate reached from each StartDate
Select 
    GroupID,
    StartDate,
    Max(EndDate) As EndDate,
    Max(Version) As Version
From
    VersionedBounds
Group By
    GroupID,
    StartDate
Order By
    GroupID,
    StartDate
-- Recursion depth equals the number of boundaries in the largest group, which
-- can exceed the default limit of 100. The recursion is bounded by @Bounds,
-- so lifting the limit (0 = no limit) is safe.
Option (MaxRecursion 0)

http://sqlfiddle.com/#!6/b95bd/2

sql - 重複するバージョン番号付きの間隔のセットで、各時点で最新のバージョンを見つけます

3 に答える 3

Related

Reference