如何计算所有工作的不重复受雇总天数——重叠的天数不重复计算

Question

/* Data Setup
 | Rebuilds the #DaysPerJob temp table with the sample job history used by the queries below.
 | One row per job; a NULL EndDate is a job with no end date recorded.
*/
DROP TABLE IF EXISTS #DaysPerJob;

CREATE TABLE #DaysPerJob
(
    GroupID   INT,
    JobDesc   VARCHAR(100),
    StartDate DATE,
    EndDate   DATE
);

INSERT INTO #DaysPerJob (GroupID, JobDesc, StartDate, EndDate)
VALUES
    (23293, 'Food Prep',                          '2017-03-01', '2017-07-17'),
    (23293, 'Finisher',                           '2021-11-19', NULL),
    (23293, 'Cashier',                            '2021-12-06', '2021-12-10'),
    (26208, '3rd SHift Stocker',                  '2019-09-25', '2020-11-05'),
    (26208, 'Order Fulfillment Assoc',            '2020-08-05', '2021-04-16'),
    (26208, 'Customer Service Rep',               '2021-05-10', '2021-10-15'),
    (26208, 'Delivery Driver',                    '2021-11-15', NULL),
    (26208, 'Another Job',                        '2022-02-23', '2022-03-02'),
    (26208, 'Same Day Job Start as Prev Job End', '2022-03-01', NULL);

--SELECT * FROM #DaysPerJob dpj ORDER BY dpj.GroupID, dpj.StartDate, dpj.EndDate

/* Days Per Job Calculations - Attempts
 | Lists every job with the end date of the job that sorts just before it in the same group,
 | plus the raw day span of the job itself. Overlapping days are NOT removed here.
*/
SELECT
    dj.GroupID,
    dj.JobDesc,
    dj.StartDate,
    dj.EndDate,
    -- End date of the preceding job (by StartDate, EndDate) within the same GroupID; NULL for the first job
    LAG(dj.EndDate) OVER (
        PARTITION BY dj.GroupID
        ORDER BY dj.StartDate, dj.EndDate
    ) AS PreviousJobEndDate,
    -- A job without an EndDate is measured up to the current date
    DATEDIFF(DAY, dj.StartDate, ISNULL(dj.EndDate, GETDATE())) AS daysPerJob
FROM #DaysPerJob AS dj
ORDER BY
    dj.GroupID,
    dj.StartDate,
    dj.EndDate;

我如何获得每组唯一受雇天数的总和？

上面的 SQL 会生成一张包含工作记录的表。每份工作都有开始日期，但并非每份工作都有结束日期；没有结束日期表示此人目前仍在从事该工作。

我一直想解决的问题是如何计算不重复的受雇天数。使用 DATEDIFF 函数计算每份工作的天数非常容易，但目前我无法处理日期范围相互重叠的其他工作，因为这样会把重叠的天数计算两次。

我按开始日期排序，然后使用 LAG 将上一份工作的结束日期与当前工作的开始日期进行比较。如果当前工作的开始日期 <= 上一份工作的结束日期，就改用上一份工作的结束日期到当前工作的结束日期来计算当前工作的天数……

但是上述条件有问题……如果上一份工作没有结束日期，或者上一份工作的结束日期晚于当前工作的结束日期，该怎么办？这意味着当前工作完全落在上一份工作的日期范围内，因此不应该为它计入任何天数，其天数应为 0，这样在汇总总天数时就不会重复计算这份工作的天数。这是最后一个我无法解决的问题，因此我在 Stack Overflow 上发帖提问。

/* Some SQL below of some things I have tried */
/* Days Per Job Calculations - Attempts
 | Compares each job with the job that sorts immediately before it in the same group.
 | When the previous job's end date falls between this job's StartDate and EndDate (inclusive),
 | the job is flagged 'InRange' and its counted span starts at the previous end date;
 | otherwise it is flagged 'NewSet' and counted from its own StartDate.
 | This is the attempt described in the question; it only ever looks one job back.
*/
;WITH JobsWithPrevious AS
(
    /* Attach the previous job's end date once so every later expression can reuse it */
    SELECT
        dj.GroupID,
        dj.JobDesc,
        dj.StartDate,
        dj.EndDate,
        LAG(dj.EndDate) OVER (PARTITION BY dj.GroupID ORDER BY dj.StartDate, dj.EndDate) AS PreviousJobEndDate
    FROM #DaysPerJob AS dj
)
SELECT
    jp.GroupID,
    jp.JobDesc,
    jp.StartDate,
    jp.EndDate,
    jp.PreviousJobEndDate,
    eff.StartDateForSet,
    /* Same test as StartDateForSet, surfaced as a label */
    CASE WHEN ovl.IsInRange = 1 THEN 'InRange' ELSE 'NewSet' END AS withinRangeCheck,
    /* Raw span of the job; a NULL EndDate is measured up to the current date */
    DATEDIFF(DAY, jp.StartDate, ISNULL(jp.EndDate, GETDATE())) AS daysPerJob,
    /* Span counted toward the set: starts at the shifted start date when the job is 'InRange' */
    DATEDIFF(DAY, eff.StartDateForSet, ISNULL(jp.EndDate, GETDATE())) AS DaysEmployedWithinSet
FROM JobsWithPrevious AS jp
/* 1 when the previous end date lies within [StartDate, EndDate]; 0 otherwise,
 | including when either PreviousJobEndDate or EndDate is NULL (the comparison is then not true) */
CROSS APPLY
(
    SELECT CASE WHEN jp.PreviousJobEndDate BETWEEN jp.StartDate AND jp.EndDate THEN 1 ELSE 0 END AS IsInRange
) AS ovl
/* Start date used for counting: the previous job's end date when in range, else the job's own start */
CROSS APPLY
(
    SELECT CASE WHEN ovl.IsInRange = 1
                THEN ISNULL(jp.PreviousJobEndDate, GETDATE())
                ELSE jp.StartDate
           END AS StartDateForSet
) AS eff
ORDER BY
    jp.GroupID,
    jp.StartDate,
    jp.EndDate;

|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-| -|-|-|-|-|-|-|-|-|-|-|-|-|

The Solution to this problem is Below based on the Chosen correct posted answer

|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-| -|-|-|-|-|-|-|-|-|-|-|-|-|

我原以为这个问题会有更多答案，但它并不简单……至少对我来说不简单，我的同事也无法回答。不管怎样，这个问题收到了两个答案。其中一个答案虽然很接近，但没有给出准确的受雇天数。我再三核对了数据，并在 Excel 中验证了计算结果；根据本例提供的数据集，总计应与下面的 SQL Server 版本的结果一致，该版本使用递归 CTE 生成日期表。

/* SUM Unique Days in Multiple Date Range Records (SQL Server).sql
 | SQL Server Example 
 | Desc: The below shows how to obtain the unique days employed, meaning we don't count the 
 |         same day twice should an individual be employed at more than one job at any given time.
*/

/* Data Setup
 | Rebuilds the #DaysPerJob temp table with the sample job history.
 | One row per job; a NULL EndDate is a job with no end date recorded.
*/
DROP TABLE IF EXISTS #DaysPerJob;

CREATE TABLE #DaysPerJob
(
    GroupID   INT,
    JobDesc   VARCHAR(100),
    StartDate DATE,
    EndDate   DATE
);

INSERT INTO #DaysPerJob (GroupID, JobDesc, StartDate, EndDate)
VALUES
    (23293, 'Food Prep',                          '2017-03-01', '2017-07-17'),
    (23293, 'Finisher',                           '2021-11-19', NULL),
    (23293, 'Starter',                            '2021-11-21', '2021-12-13'),
    (23293, 'Cashier',                            '2021-12-06', '2021-12-10'),
    (26208, '3rd SHift Stocker',                  '2019-09-25', '2020-11-05'),
    (26208, 'Order Fulfillment Assoc',            '2020-08-05', '2021-04-16'),
    (26208, 'Customer Service Rep',               '2021-05-10', '2021-10-15'),
    (26208, 'Delivery Driver',                    '2021-11-15', NULL),
    (26208, 'Another Job',                        '2022-02-23', '2022-03-02'),
    (26208, 'Same Day Job Start as Prev Job End', '2022-03-01', NULL);

/* Unique days employed per GroupID, using a recursive CTE as a dates table.
 |
 | Approach:
 |   1. Dates  - one row per calendar day from the earliest StartDate through today.
 |   2. ranked - every (day, job) pair where the job covers that day, numbered within each
 |               (GroupID, day) so that a day covered by several jobs can be counted once.
 |   3. Final  - keep ranker = 1 (one row per group per employed day) and aggregate per GroupID.
 |
 | Output columns: GroupID, daysEmployed, minStartDate, maxEndDate, TotalDaysInRange, unEmployedDays.
 | All day counts are inclusive of both end points.
*/
;WITH Dates(date) AS
(
    SELECT MIN(StartDate) AS date
    FROM #DaysPerJob

    UNION ALL

    SELECT DATEADD(DAY, 1, date)
    FROM Dates
    /* Compare DATE to DATE so the last generated day is today (no row for tomorrow) */
    WHERE date < CAST(GETDATE() AS DATE)
)
, ranked AS
(   /* Rank each job per group and day so overlapping jobs on the same day can be collapsed to one row.
     | An INNER JOIN keeps only days on which the group held at least one job.
     | ISNULL returns the type of its first argument (DATE), so an open-ended job runs through today.
    */
    SELECT j.GroupID, j.JobDesc, j.StartDate, j.EndDate, d.date
        , ROW_NUMBER() OVER (PARTITION BY j.GroupID, d.date
                             ORDER BY j.StartDate, ISNULL(j.EndDate, GETDATE())) AS ranker
    FROM Dates d
        INNER JOIN #DaysPerJob j ON j.StartDate <= d.date
                                 AND ISNULL(j.EndDate, GETDATE()) >= d.date
)

    /* Non Aggregate Data - swap the final SELECT for this one to inspect the day-level rows
    SELECT * FROM ranked r
    ORDER BY r.date, r.StartDate, ISNULL(r.EndDate, GETDATE()), r.GroupID
    OPTION (MAXRECURSION 0)
    */

/* Aggregated Data */
SELECT r.GroupID
    , COUNT(*) AS daysEmployed
    , MIN(r.date) AS minStartDate
    , MAX(r.date) AS maxEndDate
    /* DATEDIFF counts day boundaries crossed, so add 1 to include both the first and the last day;
     | this puts the range on the same inclusive basis as daysEmployed */
    , DATEDIFF(DAY, MIN(r.date), MAX(r.date)) + 1 AS TotalDaysInRange
    /* Days in the range on which the group held no job at all */
    , DATEDIFF(DAY, MIN(r.date), MAX(r.date)) + 1 - COUNT(*) AS unEmployedDays
FROM ranked r
WHERE r.ranker = 1
GROUP BY r.GroupID
ORDER BY r.GroupID
/* The default MAXRECURSION limit is 100, i.e. at most 100 generated dates; 0 removes the limit altogether */
OPTION (MAXRECURSION 0);

按 GroupID 分组的总计屏幕截图：

根据截至发帖当天（2022-06-02）的屏幕截图，总数为：

GroupID 23293 : 335 Days Employed

GroupID 26208 : 929 Days Employed

Answer 1

这是经过一段时间整理数据后得出的另一个答案。请原谅我，我把它放到了一个更容易使用的格式中。这应该有效。

/* Data Setup
 | Rebuilds the #DaysPerJob temp table with the sample job history.
 | One row per job; a NULL EndDate is a job with no end date recorded.
*/
DROP TABLE IF EXISTS #DaysPerJob;

CREATE TABLE #DaysPerJob
(
    GroupID   INT,
    JobDesc   VARCHAR(100),
    StartDate DATE,
    EndDate   DATE
);

INSERT INTO #DaysPerJob (GroupID, JobDesc, StartDate, EndDate)
VALUES
    (23293, 'Food Prep',                          '2017-03-01', '2017-07-17'),
    (23293, 'Finisher',                           '2021-11-19', NULL),
    (23293, 'Cashier',                            '2021-12-06', '2021-12-10'),
    (26208, '3rd SHift Stocker',                  '2019-09-25', '2020-11-05'),
    (26208, 'Order Fulfillment Assoc',            '2020-08-05', '2021-04-16'),
    (26208, 'Customer Service Rep',               '2021-05-10', '2021-10-15'),
    (26208, 'Delivery Driver',                    '2021-11-15', NULL),
    (26208, 'Another Job',                        '2022-02-23', '2022-03-02'),
    (26208, 'Same Day Job Start as Prev Job End', '2022-03-01', NULL);

--SELECT * FROM #DaysPerJob dpj ORDER BY dpj.GroupID, dpj.StartDate, dpj.EndDate

/* Per-job day calculation (Answer 1)
 | Produces one row per job with a DayCalc column:
 |   - jobs are numbered per GroupID in StartDate, EndDate order;
 |   - a running count tracks how many jobs with a NULL EndDate have appeared so far in the group;
 |   - while that count is still 0, DayCalc is the number of days from the job's StartDate to either
 |     the smallest EndDate seen so far in the group (when that date is on or after the job's StartDate)
 |     or the job's own EndDate;
 |   - once the count is above 0, DayCalc is NULL.
*/
;WITH NumberedJobs AS
(
    /* Flag jobs without an EndDate and give every job a sequence number inside its group */
    SELECT
        src.GroupID,
        src.JobDesc,
        src.StartDate,
        src.EndDate,
        CASE WHEN src.EndDate IS NULL THEN 1 ELSE 0 END AS OpenEndedFlag,
        ROW_NUMBER() OVER (PARTITION BY src.GroupID ORDER BY src.StartDate, src.EndDate) AS JobSeq
    FROM #DaysPerJob AS src
),
RunningOpenEnded AS
(
    /* Running total of the flag: 0 until the first NULL-EndDate job in the group, >= 1 from then on */
    SELECT
        nj.GroupID,
        nj.JobDesc,
        nj.StartDate,
        nj.EndDate,
        nj.OpenEndedFlag,
        nj.JobSeq,
        SUM(nj.OpenEndedFlag) OVER (
            PARTITION BY nj.GroupID
            ORDER BY nj.JobSeq
            ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
        ) AS OpenEndedSoFar
    FROM NumberedJobs AS nj
),
EffectiveEnd AS
(
    /* For each job, the smallest EndDate among the rows up to and including it whose running total is 0;
     | used when it is on or after the job's StartDate, otherwise the job's own EndDate is used */
    SELECT
        cur.GroupID,
        cur.JobSeq,
        CASE
            WHEN MIN(seen.EndDate) >= cur.StartDate THEN MIN(seen.EndDate)
            ELSE cur.EndDate
        END AS EndDateOrMinOverlapped
    FROM RunningOpenEnded AS cur
    LEFT JOIN RunningOpenEnded AS seen
        ON  seen.GroupID = cur.GroupID
        AND seen.JobSeq <= cur.JobSeq
        AND seen.OpenEndedSoFar = 0
    GROUP BY
        cur.GroupID,
        cur.JobSeq,
        cur.StartDate,
        cur.EndDate
)
/* Final output */
SELECT
    job.GroupID,
    job.JobDesc,
    job.StartDate,
    job.EndDate,
    CASE
        WHEN job.OpenEndedSoFar = 0 THEN DATEDIFF(DAY, job.StartDate, eff.EndDateOrMinOverlapped)
    END AS DayCalc
FROM EffectiveEnd AS eff
INNER JOIN RunningOpenEnded AS job
    ON  job.GroupID = eff.GroupID
    AND job.JobSeq = eff.JobSeq
ORDER BY
    job.GroupID,
    job.JobSeq;

Answer 2

我无法访问 SqlServer 实例来测试它，所以这是 SQLite 语法，但我认为转换它应该不难。

我采用的方法基本上是使用一张“日期”表，然后将 DaysPerJob 表与之联接，这样就能得到每个 GroupID 在其受雇的每一天各有一条记录。然后按日期和 GroupID 进行排名，用于过滤掉“重叠”的工作日。

/* Unique days worked per GroupID (SQLite syntax).
   A recursive CTE builds a table of dates from the earliest StartDate through the current date;
   an existing date table could be used instead.
   Each date is joined to every job covering it, rows are numbered within (date, GroupID),
   and only row 1 is kept so a day covered by several jobs is counted once. */
WITH RECURSIVE dates(date) AS (
    SELECT MIN(StartDate)
    FROM DaysPerJob

    UNION ALL

    SELECT DATE(date, '+1 day')
    FROM dates
    WHERE date < DATE()
),
ranked AS (
    SELECT
        d.date,
        j.StartDate,
        j.EndDate,
        j.GroupID,
        j.JobDesc,
        ROW_NUMBER() OVER (PARTITION BY d.date, j.GroupID) AS ranker
    FROM dates AS d
    INNER JOIN DaysPerJob AS j
        ON  DATE(j.StartDate) <= DATE(d.date)
        -- a job with no EndDate is treated as running through the current date
        AND IFNULL(j.EndDate, DATE()) >= DATE(d.date)
    WHERE j.GroupID IS NOT NULL
)
SELECT
    COUNT(*) AS days_worked,
    r.GroupID
FROM ranked AS r
WHERE r.ranker = 1
GROUP BY r.GroupID;

如何计算所有工作的不重复受雇总天数——重叠的天数不重复计算

How to Calculate the Total Unique Days Employed for All Jobs - No overlap days counted twice

sql-server