WITH
users AS (
SELECT
* EXCEPT (timestamp), timestamp as registrationTimestamp
FROM
`logs.registeredUsers`),
initialReviews AS (
SELECT
DISTINCT userID,
registrationTimestamp,
schedule,
cardID
FROM
(SELECT * EXCEPT (reviewCount), RANK() OVER (PARTITION BY userID, cardID ORDER BY timestamp) AS reviewCount FROM `logs.reviews`)
JOIN
`users`
USING
(userID)
WHERE
sessionID IS NULL AND reviewCount = 1 AND
cardID IN(
SELECT
cardID
FROM
`logs.latestEssaysCards`
WHERE
essayName="qcvc")
),
achievers AS (SELECT DISTINCT userID, MIN(beforeInterval) / (1000*60*60*24) AS firstInterval FROM `logs.reviews` JOIN initialReviews USING (userID, cardID) WHERE sessionID IS NOT NULL AND isRetry IS NOT TRUE AND beforeInterval > 1000*60*60*24 AND reviewMarking="remembered" GROUP BY userID HAVING COUNT(DISTINCT cardID) > 50),
subsequentReviews AS (
SELECT
*,
TIMESTAMP_DIFF(MAX(timestamp) OVER (PARTITION BY userID, cardID ORDER BY timestamp ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING), MIN(timestamp) OVER (PARTITION BY userID, cardID ORDER BY timestamp ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING), DAY) AS span,
RANK() OVER (PARTITION BY userID, cardID ORDER BY timestamp) AS realReviewCount,
FROM
`logs.reviews`
JOIN
initialReviews
USING
(userID,
cardID)
WHERE
sessionID IS NOT NULL AND isRetry IS NOT TRUE),
samples AS (
SELECT *, RANK() OVER (PARTITION BY cardID, userID ORDER BY timestamp) AS delayedRecallCount FROM subsequentReviews WHERE reviewMarking="remembered" AND beforeInterval > 1000*60*60*24
),
byUser AS (
SELECT userID, ANY_VALUE(firstInterval) AS firstInterval, SUM(realReviewCount) / COUNT(DISTINCT cardID) AS averageReviewCount FROM samples JOIN achievers USING (userID) WHERE delayedRecallCount = 1 GROUP BY userID
)
SELECT
firstInterval AS days,
APPROX_QUANTILES(averageReviewCount, 10),
COUNT(DISTINCT userID) AS N,
COUNT(*) AS promptN,
FROM
byUser
GROUP BY
days
ORDER BY
days