강의

멘토링

로드맵

인프런 커뮤니티 질문&답변

김희은님의 프로필 이미지
김희은

작성한 질문수

[빠짝스터디 2주차 과제] 윈도우함수 연습문제

작성

·

45

0


-- 함수(컬럼) OVER (PARTITION BY 파티션 컬럼 ORDER BY 정렬할 컬럼) AS 
-- LEAD, LAG  -> 함수(컬럼,순서). 순서를 명시하지 않으면 디폴트 1

-- 1) 

select 
  *,
  LEAD(visit_month) OVER (PARTITION BY user_id ORDER BY visit_month ASC) AS lead_visit_month,
  LEAD(visit_month,2) OVER (PARTITION BY user_id ORDER BY visit_month ASC) AS lead2_visit_month
FROM advanced.analytics_function_01
ORDER BY user_id
;
-- 여기서 *를 쓰고 진행하는 이유가 있나?

-- 2) 
select 
  *,
  LEAD(visit_month,1) OVER (PARTITION BY user_id ORDER BY visit_month ASC) AS after_visit_month,
  LEAD(visit_month,2) OVER (PARTITION BY user_id ORDER BY visit_month ASC) AS after_visit_month,
  LAG(visit_month,1) OVER (PARTITION BY user_id ORDER BY visit_month ASC) AS before_visit_month
FROM advanced.analytics_function_01
ORDER BY user_id, visit_month
;

-- 3) 

select 
  *,
  LEAD(visit_month,1) OVER (PARTITION BY user_id ORDER BY visit_month ASC) AS after_visit_month,
  LEAD(visit_month,1) OVER (PARTITION BY user_id ORDER BY visit_month ASC) - visit_month
FROM advanced.analytics_function_01
ORDER BY user_id, visit_month
;

-- 서브쿼리
SELECT 
  *,
  after_visit_month-visit_month AS diff
FROM (
  select 
    *,
    LEAD(visit_month,1) OVER (PARTITION BY user_id ORDER BY visit_month ASC) AS after_visit_month
  FROM advanced.analytics_function_01
)
ORDER BY user_id, visit_month
-- 쿼리가 길어지는 것을 무서워하지말고 쿼리를 덜 수정하는 방향으로 하자 

-- # 추가문제 : 이 데이터셋 기준으로 user_id의 첫번째 방문월, 마지막방문월 구하는 쿼리 작성하기 first_value, last_value
SELECT 
  *,
  FIRST_VALUE(visit_month) OVER (PARTITION BY user_id ORDER BY visit_month ASC) AS first_visit_month,
  LAST_VALUE(visit_month) OVER (PARTITION BY user_id ORDER BY visit_month ASC) AS last_visit_month
FROM advanced.analytics_function_01
ORDER BY user_id, visit_month
;

-- 4) 시간의 흐름에 따라, 일자별로 유저가 실행한 누적 쿼리 수를 작성해주세요
SELECT
  *,
  SUM(query_cnt) OVER(PARTITION BY user ORDER BY query_date) AS cumulative_sum,
  SUM(query_cnt) OVER(PARTITION BY user ORDER BY query_date ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS cumulative_sum2
FROM (
  SELECT
    query_date,
    team,
    user,
    COUNT(user) AS query_cnt
  FROM advanced.query_logs
  GROUP BY ALL
)
ORDER BY user, query_date

-- 5)
WITH raw_data AS (
	SELECT DATE '2024-05-01' AS date, 15 AS number_of_orders UNION ALL
	SELECT DATE '2024-05-02', 13 UNION ALL
	SELECT DATE '2024-05-03', NULL UNION ALL
	SELECT DATE '2024-05-04', 16 UNION ALL
	SELECT DATE '2024-05-05', NULL UNION ALL
	SELECT DATE '2024-05-06', 18 UNION ALL
	SELECT DATE '2024-05-07' , 20 UNION ALL
	SELECT DATE '2024-05-08', NULL UNION ALL
	SELECT DATE '2024-05-09', 13 UNION ALL
	SELECT DATE '2024-05-10', 14 UNION ALL
	SELECT DATE '2024-05-11', NULL UNION ALL
	SELECT DATE '2024-05-12', NULL UNION ALL
)

SELECT
	*,
	LAST_VALUE(number_of_order IGNORE NULLS) OVER (ORDER BY date) AS last_value_orders
FROM raw_data

-- 6)
WITH raw_data AS (
	SELECT DATE '2024-05-01' AS date, 15 AS number_of_orders UNION ALL
	SELECT DATE '2024-05-02', 13 UNION ALL
	SELECT DATE '2024-05-03', NULL UNION ALL
	SELECT DATE '2024-05-04', 16 UNION ALL
	SELECT DATE '2024-05-05', NULL UNION ALL
	SELECT DATE '2024-05-06', 18 UNION ALL
	SELECT DATE '2024-05-07' , 20 UNION ALL
	SELECT DATE '2024-05-08', NULL UNION ALL
	SELECT DATE '2024-05-09', 13 UNION ALL
	SELECT DATE '2024-05-10', 14 UNION ALL
	SELECT DATE '2024-05-11', NULL UNION ALL
	SELECT DATE '2024-05-12', NULL
), filled_data AS (
	SELECT
		* EXCEPT(number_of_orders),
		**LAST_VALUE**(number_of_orders **IGNORE NULLS**) OVER (ORDER BY date) AS number_of_orders
	FROM raw_data
)

SELECT
	*,
	AVG(number_of_orders) OVER (ORDER BY date ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS moving_avg
FROM filled_data



-- 7) app_logs 테이블에서 Custom Session을 만들어 주세요. 이전 이벤트 로그와 20초가 지나면 새로운 Session을 만들어 주세요. Session은 숫자로 (1, 2, 3 …) 표시해도 됩니다
-- 2022-08-18일의 user_pseudo_id(1997494153.8491999091)은 session_id가 4까지 나옵니다
WITH base AS (
  SELECT
    event_date,
    DATETIME(TIMESTAMP_MICROS(event_timestamp), 'Asia/Seoul') AS event_datetime,
    event_name,
    user_id,
    user_pseudo_id
    -- event_params가 필요하면 UNNEST => PIVOT해서 사용하면 됨
  FROM advanced.app_logs 
  WHERE 
    event_date = "2022-08-18"
    AND user_pseudo_id = "1997494153.8491999091"
  -- WHERE 조건에 이벤트를 필터링해서 계산해도 괜찮음. 여기서는 필터링을 하지 않고, 진행
), diff_data AS (
  SELECT
    *,
    DATETIME_DIFF(event_datetime, prev_event_datetime, SECOND) AS second_diff
    # second_diff 기반으로 새로운 세션의 시작일지, 아닐지를 판단할 수 있음
  FROM (
    SELECT
      *,
      LAG(event_datetime, 1) OVER(PARTITION BY user_pseudo_id ORDER BY event_datetime) AS prev_event_datetime
      # event_datetime이랑 prev_event_datetime을 빼서 20초가 넘으면 새로운 세션으로 정의. 
      # 20초가 넘지 않으면 기존 세션
      -- DATETIME_DIFF() => 차이를 구할 수 있음
    FROM base
  )
)

SELECT
  *,
  # 누적합을 사용해서 session_number를 만들었다!
  SUM(session_start) OVER(PARTITION BY user_pseudo_id ORDER BY event_datetime) AS session_num
  # session을 구할 때 쿼리가 길어질 수 있음. 하루에 접속을 여러번 하는 서비스 => session 기반이 좋을 수 있고, 아니라고 하면 일자별 유저 집계가 나을 수 있다
FROM (
  SELECT
    *, 
    CASE
      WHEN prev_event_datetime IS NULL THEN 1
      WHEN second_diff >= 30 THEN 1 # session을 나누는 기준 초. 데이터를 탐색하면서 결정. 보통 앱 로그에서는 30초, 60초
      ELSE 0
      END AS session_start
      # session이 시작됨을 알리는 session_start
  FROM diff_data
)
ORDER BY event_datetime



답변

답변을 기다리고 있는 질문이에요
첫번째 답변을 남겨보세요!
김희은님의 프로필 이미지
김희은

작성한 질문수

질문하기