Data & Analytics

데이터 품질 체크 자동화: NULL, 중복, 일관성 검증 SQL 템플릿

데이터 품질 이슈를 사전에 잡는 SQL 체크리스트. NULL, 중복, 참조 무결성, 범위 검증.

데이터 품질 체크 자동화: NULL, 중복, 일관성 검증 SQL 템플릿

데이터 품질 체크를 SQL로: 신뢰할 수 있는 데이터 만들기

데이터 분석 전 필수! NULL, 중복, 범위, 일관성 체크를 SQL로 자동화하기.

TL;DR

  • 데이터 품질: 분석 결과의 신뢰성을 결정하는 핵심 요소
  • 체크 항목: 완전성, 유일성, 유효성, 일관성, 적시성
  • 자동화: 정기적 품질 검증으로 문제 조기 발견
  • 리포팅: 품질 점수 대시보드로 현황 파악

1. 왜 데이터 품질이 중요한가?

Garbage In, Garbage Out

sql
매출 리포트: 1억 2천만원
실제 매출: 8천만원

→ NULL 값을 0으로 처리 안 함
→ 중복 주문이 포함됨
→ 취소 건이 제외 안 됨

잘못된 데이터 → 잘못된 의사결정 → 비용 손실

데이터 품질의 5가지 차원

차원설명예시
완전성필수 데이터 존재 여부이메일 NULL 비율
유일성중복 없음동일 주문 중복 입력
유효성값의 범위/형식 적합나이가 -5 또는 200
일관성데이터 간 논리적 정합주문금액 ≠ 상품금액 합계
적시성데이터 최신성3일 전 데이터가 최신

2. 완전성 체크 (NULL 검사)

테이블별 NULL 비율 확인

sql
-- 각 컬럼별 NULL 비율
SELECT
    'user_id' as column_name,
    COUNT(*) as total_rows,
    SUM(CASE WHEN user_id IS NULL THEN 1 ELSE 0 END) as null_count,
    ROUND(SUM(CASE WHEN user_id IS NULL THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 2) as null_pct
FROM orders

UNION ALL

SELECT
    'email',
    COUNT(*),
    SUM(CASE WHEN email IS NULL THEN 1 ELSE 0 END),
    ROUND(SUM(CASE WHEN email IS NULL THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 2)
FROM orders

UNION ALL

SELECT
    'amount',
    COUNT(*),
    SUM(CASE WHEN amount IS NULL THEN 1 ELSE 0 END),
    ROUND(SUM(CASE WHEN amount IS NULL THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 2)
FROM orders

ORDER BY null_pct DESC;

결과:

column_nametotal_rowsnull_countnull_pct
email10000052345.23
amount100000120.01
user_id10000000.00

동적 NULL 체크 (PostgreSQL)

sql
-- 모든 컬럼 자동 검사 (information_schema 활용)
DO $$
DECLARE
    col RECORD;
    null_count INTEGER;
    total_count INTEGER;
BEGIN
    SELECT COUNT(*) INTO total_count FROM orders;

    FOR col IN
        SELECT column_name
        FROM information_schema.columns
        WHERE table_name = 'orders'
    LOOP
        EXECUTE format('SELECT COUNT(*) FROM orders WHERE %I IS NULL', col.column_name)
        INTO null_count;

        RAISE NOTICE '% : % / % (%.2f%%)',
            col.column_name, null_count, total_count,
            null_count * 100.0 / total_count;
    END LOOP;
END $$;

빈 문자열도 체크

sql
-- NULL + 빈 문자열 + 공백만 있는 경우
SELECT
    COUNT(*) as total,
    SUM(CASE
        WHEN email IS NULL
          OR TRIM(email) = ''
        THEN 1 ELSE 0
    END) as empty_count,
    ROUND(SUM(CASE
        WHEN email IS NULL
          OR TRIM(email) = ''
        THEN 1 ELSE 0
    END) * 100.0 / COUNT(*), 2) as empty_pct
FROM users;

3. 유일성 체크 (중복 검사)

PK 중복 확인

sql
-- order_id가 유일해야 하는 경우
SELECT
    order_id,
    COUNT(*) as duplicate_count
FROM orders
GROUP BY order_id
HAVING COUNT(*) > 1
ORDER BY duplicate_count DESC;

비즈니스 키 중복

sql
-- 같은 사용자가 같은 시간에 같은 상품을 주문 (중복 의심)
SELECT
    user_id,
    product_id,
    DATE_TRUNC('minute', created_at) as order_minute,
    COUNT(*) as count
FROM orders
GROUP BY user_id, product_id, DATE_TRUNC('minute', created_at)
HAVING COUNT(*) > 1
ORDER BY count DESC;

중복률 리포트

sql
WITH duplicate_analysis AS (
    SELECT
        order_id,
        COUNT(*) as occurrence
    FROM orders
    GROUP BY order_id
)
SELECT
    COUNT(*) as total_unique_ids,
    SUM(occurrence) as total_rows,
    SUM(CASE WHEN occurrence > 1 THEN occurrence ELSE 0 END) as duplicate_rows,
    ROUND(
        SUM(CASE WHEN occurrence > 1 THEN occurrence ELSE 0 END) * 100.0 / SUM(occurrence),
        4
    ) as duplicate_pct
FROM duplicate_analysis;

4. 유효성 체크 (범위/형식 검사)

숫자 범위 검사

sql
-- 비정상적인 값 찾기
SELECT
    'amount' as field,
    COUNT(*) as invalid_count,
    MIN(amount) as min_value,
    MAX(amount) as max_value
FROM orders
WHERE amount < 0 OR amount > 100000000  -- 1억 초과

UNION ALL

SELECT
    'quantity',
    COUNT(*),
    MIN(quantity),
    MAX(quantity)
FROM order_items
WHERE quantity <= 0 OR quantity > 1000

UNION ALL

SELECT
    'discount_rate',
    COUNT(*),
    MIN(discount_rate),
    MAX(discount_rate)
FROM promotions
WHERE discount_rate < 0 OR discount_rate > 1;  -- 0~100% 범위

날짜 유효성 검사

sql
-- 미래 날짜, 너무 오래된 날짜
SELECT
    order_id,
    created_at,
    CASE
        WHEN created_at > CURRENT_TIMESTAMP THEN 'FUTURE_DATE'
        WHEN created_at < '2020-01-01' THEN 'TOO_OLD'
        ELSE 'VALID'
    END as date_status
FROM orders
WHERE created_at > CURRENT_TIMESTAMP
   OR created_at < '2020-01-01';

형식 검사 (정규식)

sql
-- 이메일 형식 검사
SELECT
    email,
    CASE
        WHEN email IS NULL THEN 'NULL'
        WHEN email !~ '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$' THEN 'INVALID_FORMAT'
        ELSE 'VALID'
    END as email_status
FROM users
WHERE email IS NULL
   OR email !~ '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$';

-- 전화번호 형식 검사
SELECT
    phone,
    CASE
        WHEN phone IS NULL THEN 'NULL'
        WHEN phone !~ '^\d{2,3}-\d{3,4}-\d{4}$' THEN 'INVALID_FORMAT'
        ELSE 'VALID'
    END as phone_status
FROM users
WHERE phone IS NOT NULL
  AND phone !~ '^\d{2,3}-\d{3,4}-\d{4}$';

ENUM 값 검사

sql
-- 허용된 값만 있는지 확인
SELECT
    status,
    COUNT(*) as count
FROM orders
WHERE status NOT IN ('pending', 'confirmed', 'shipped', 'delivered', 'cancelled')
GROUP BY status;

-- 허용 값 목록과 비교
WITH allowed_values AS (
    SELECT unnest(ARRAY['pending', 'confirmed', 'shipped', 'delivered', 'cancelled']) as status
)
SELECT DISTINCT o.status as invalid_status
FROM orders o
LEFT JOIN allowed_values av ON o.status = av.status
WHERE av.status IS NULL;

5. 일관성 체크 (논리적 정합성)

금액 일관성

sql
-- 주문 금액 = 상품 금액 합계 - 할인 + 배송비
WITH order_calculated AS (
    SELECT
        o.order_id,
        o.total_amount as recorded_amount,
        COALESCE(SUM(oi.quantity * oi.unit_price), 0) as items_total,
        COALESCE(o.discount_amount, 0) as discount,
        COALESCE(o.shipping_fee, 0) as shipping
    FROM orders o
    LEFT JOIN order_items oi ON o.order_id = oi.order_id
    GROUP BY o.order_id, o.total_amount, o.discount_amount, o.shipping_fee
)
SELECT
    order_id,
    recorded_amount,
    items_total - discount + shipping as calculated_amount,
    recorded_amount - (items_total - discount + shipping) as difference
FROM order_calculated
WHERE ABS(recorded_amount - (items_total - discount + shipping)) > 1  -- 1원 이상 차이
ORDER BY ABS(difference) DESC
LIMIT 100;

날짜 순서 일관성

sql
-- 주문일 < 결제일 < 배송일 < 완료일 순서 확인
SELECT
    order_id,
    order_date,
    payment_date,
    shipped_date,
    delivered_date,
    CASE
        WHEN payment_date < order_date THEN 'PAYMENT_BEFORE_ORDER'
        WHEN shipped_date < payment_date THEN 'SHIPPED_BEFORE_PAYMENT'
        WHEN delivered_date < shipped_date THEN 'DELIVERED_BEFORE_SHIPPED'
        ELSE 'VALID'
    END as date_consistency
FROM orders
WHERE payment_date < order_date
   OR shipped_date < payment_date
   OR delivered_date < shipped_date;

FK 참조 무결성

sql
-- orders.user_id가 users 테이블에 존재하는지
SELECT
    o.order_id,
    o.user_id as orphan_user_id
FROM orders o
LEFT JOIN users u ON o.user_id = u.user_id
WHERE u.user_id IS NULL;

-- 존재하지 않는 product_id 참조
SELECT
    oi.order_item_id,
    oi.product_id as orphan_product_id
FROM order_items oi
LEFT JOIN products p ON oi.product_id = p.product_id
WHERE p.product_id IS NULL;

상태 전이 일관성

sql
-- 잘못된 상태 변경 찾기 (예: pending에서 바로 delivered)
WITH status_transitions AS (
    SELECT
        order_id,
        status,
        LAG(status) OVER (PARTITION BY order_id ORDER BY updated_at) as prev_status,
        updated_at
    FROM order_status_history
)
SELECT *
FROM status_transitions
WHERE prev_status IS NOT NULL
  AND NOT (
    (prev_status = 'pending' AND status IN ('confirmed', 'cancelled'))
    OR (prev_status = 'confirmed' AND status IN ('shipped', 'cancelled'))
    OR (prev_status = 'shipped' AND status IN ('delivered', 'returned'))
    OR (prev_status = 'delivered' AND status = 'returned')
  );

6. 적시성 체크 (데이터 신선도)

최신 데이터 확인

sql
-- 테이블별 마지막 업데이트 시간
SELECT
    'orders' as table_name,
    MAX(created_at) as latest_record,
    CURRENT_TIMESTAMP - MAX(created_at) as data_lag
FROM orders

UNION ALL

SELECT
    'events',
    MAX(event_time),
    CURRENT_TIMESTAMP - MAX(event_time)
FROM events

UNION ALL

SELECT
    'users',
    MAX(updated_at),
    CURRENT_TIMESTAMP - MAX(updated_at)
FROM users

ORDER BY data_lag DESC;

결과:

table_namelatest_recorddata_lag
events2024-01-15 09:30:0000:30:00
orders2024-01-15 09:15:0000:45:00
users2024-01-14 23:00:0010:30:00

시간대별 데이터 유입 확인

sql
-- 시간별 데이터 유입량 (갑자기 0이면 문제)
SELECT
    DATE_TRUNC('hour', created_at) as hour,
    COUNT(*) as record_count
FROM orders
WHERE created_at >= CURRENT_DATE - INTERVAL '3 days'
GROUP BY DATE_TRUNC('hour', created_at)
ORDER BY hour DESC;

누락된 날짜 찾기

sql
WITH RECURSIVE date_series AS (
    SELECT DATE '2024-01-01' as date
    UNION ALL
    SELECT date + 1
    FROM date_series
    WHERE date < CURRENT_DATE - 1
),
daily_counts AS (
    SELECT DATE(created_at) as date, COUNT(*) as cnt
    FROM orders
    WHERE created_at >= '2024-01-01'
    GROUP BY DATE(created_at)
)
SELECT
    ds.date as missing_date
FROM date_series ds
LEFT JOIN daily_counts dc ON ds.date = dc.date
WHERE dc.date IS NULL
ORDER BY ds.date;

7. 종합 품질 리포트

테이블별 품질 점수

sql
WITH completeness AS (
    SELECT
        'completeness' as dimension,
        100 - ROUND(
            (SUM(CASE WHEN user_id IS NULL THEN 1 ELSE 0 END) +
             SUM(CASE WHEN amount IS NULL THEN 1 ELSE 0 END) +
             SUM(CASE WHEN created_at IS NULL THEN 1 ELSE 0 END)) * 100.0 /
            (COUNT(*) * 3), 2
        ) as score
    FROM orders
),
uniqueness AS (
    SELECT
        'uniqueness' as dimension,
        100 - ROUND(
            (SELECT COUNT(*) FROM (
                SELECT order_id FROM orders GROUP BY order_id HAVING COUNT(*) > 1
            ) dups) * 100.0 / COUNT(*), 2
        ) as score
    FROM orders
),
validity AS (
    SELECT
        'validity' as dimension,
        100 - ROUND(
            SUM(CASE
                WHEN amount < 0 OR amount > 100000000
                  OR created_at > CURRENT_TIMESTAMP
                THEN 1 ELSE 0
            END) * 100.0 / COUNT(*), 2
        ) as score
    FROM orders
),
timeliness AS (
    SELECT
        'timeliness' as dimension,
        CASE
            WHEN MAX(created_at) > CURRENT_TIMESTAMP - INTERVAL '1 hour' THEN 100
            WHEN MAX(created_at) > CURRENT_TIMESTAMP - INTERVAL '1 day' THEN 80
            WHEN MAX(created_at) > CURRENT_TIMESTAMP - INTERVAL '7 days' THEN 50
            ELSE 0
        END as score
    FROM orders
)
SELECT * FROM completeness
UNION ALL SELECT * FROM uniqueness
UNION ALL SELECT * FROM validity
UNION ALL SELECT * FROM timeliness;

결과:

dimensionscore
completeness99.87
uniqueness99.99
validity99.95
timeliness100.00

일별 품질 추이

sql
WITH daily_quality AS (
    SELECT
        DATE(created_at) as date,
        COUNT(*) as total_records,
        -- 완전성
        ROUND(100 - SUM(CASE WHEN amount IS NULL THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 2) as completeness,
        -- 유효성
        ROUND(100 - SUM(CASE WHEN amount < 0 THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 2) as validity
    FROM orders
    WHERE created_at >= CURRENT_DATE - INTERVAL '30 days'
    GROUP BY DATE(created_at)
)
SELECT
    date,
    total_records,
    completeness,
    validity,
    ROUND((completeness + validity) / 2, 2) as avg_quality_score
FROM daily_quality
ORDER BY date DESC;

8. 자동화된 품질 체크

품질 규칙 테이블

sql
-- 품질 규칙 정의 테이블
CREATE TABLE data_quality_rules (
    rule_id SERIAL PRIMARY KEY,
    table_name VARCHAR(100),
    rule_name VARCHAR(200),
    rule_type VARCHAR(50),  -- completeness, uniqueness, validity, consistency
    check_query TEXT,
    threshold DECIMAL(5,2),  -- 허용 기준 (예: 99.0 = 99% 이상)
    is_active BOOLEAN DEFAULT TRUE
);

-- 규칙 예시 삽입
INSERT INTO data_quality_rules (table_name, rule_name, rule_type, check_query, threshold) VALUES
('orders', 'order_id_not_null', 'completeness',
 'SELECT 100 - COUNT(*) FILTER (WHERE order_id IS NULL) * 100.0 / COUNT(*) FROM orders', 100),
('orders', 'order_id_unique', 'uniqueness',
 'SELECT 100 - (SELECT COUNT(*) FROM (SELECT order_id FROM orders GROUP BY order_id HAVING COUNT(*) > 1) d) * 100.0 / COUNT(*) FROM orders', 100),
('orders', 'amount_positive', 'validity',
 'SELECT 100 - COUNT(*) FILTER (WHERE amount < 0) * 100.0 / COUNT(*) FROM orders', 99.9);

품질 체크 실행 및 결과 저장

sql
-- 품질 체크 결과 테이블
CREATE TABLE data_quality_results (
    result_id SERIAL PRIMARY KEY,
    rule_id INT REFERENCES data_quality_rules(rule_id),
    check_timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    score DECIMAL(5,2),
    passed BOOLEAN,
    details JSONB
);

-- 실행 예시 (실제로는 프로시저나 스케줄러로 실행)
INSERT INTO data_quality_results (rule_id, score, passed)
SELECT
    1 as rule_id,
    100 - COUNT(*) FILTER (WHERE order_id IS NULL) * 100.0 / COUNT(*) as score,
    (100 - COUNT(*) FILTER (WHERE order_id IS NULL) * 100.0 / COUNT(*)) >= 100 as passed
FROM orders;

품질 알림 쿼리

sql
-- 기준 미달 항목 알림
SELECT
    r.table_name,
    r.rule_name,
    r.rule_type,
    r.threshold as required_score,
    qr.score as actual_score,
    qr.check_timestamp
FROM data_quality_rules r
JOIN data_quality_results qr ON r.rule_id = qr.rule_id
WHERE qr.check_timestamp = (
    SELECT MAX(check_timestamp)
    FROM data_quality_results qr2
    WHERE qr2.rule_id = qr.rule_id
)
AND qr.passed = FALSE
ORDER BY qr.score ASC;

9. 데이터 프로파일링

컬럼 통계 요약

sql
-- 숫자 컬럼 프로파일링
SELECT
    'amount' as column_name,
    COUNT(*) as total_count,
    COUNT(DISTINCT amount) as distinct_count,
    SUM(CASE WHEN amount IS NULL THEN 1 ELSE 0 END) as null_count,
    MIN(amount) as min_value,
    MAX(amount) as max_value,
    ROUND(AVG(amount), 2) as avg_value,
    ROUND(STDDEV(amount), 2) as stddev_value,
    PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY amount) as median_value
FROM orders;

카테고리 컬럼 분포

sql
-- 상위 값 분포
WITH value_counts AS (
    SELECT
        status,
        COUNT(*) as count,
        ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 2) as percentage
    FROM orders
    GROUP BY status
)
SELECT
    status,
    count,
    percentage,
    REPEAT('█', (percentage / 5)::INT) as bar
FROM value_counts
ORDER BY count DESC;

결과:

statuscountpercentagebar
delivered4500045.00█████████
shipped2500025.00█████
confirmed1500015.00███
pending1000010.00██
cancelled50005.00

10. 품질 이슈 수정

NULL 값 처리

sql
-- 기본값으로 대체 (UPDATE 전 백업 권장)
UPDATE orders
SET shipping_fee = 0
WHERE shipping_fee IS NULL;

-- 다른 테이블에서 값 가져오기
UPDATE orders o
SET user_email = u.email
FROM users u
WHERE o.user_id = u.user_id
  AND o.user_email IS NULL;

중복 제거

sql
-- 중복 중 최신 것만 남기기
WITH duplicates AS (
    SELECT
        order_id,
        ROW_NUMBER() OVER (
            PARTITION BY user_id, product_id, DATE_TRUNC('minute', created_at)
            ORDER BY created_at DESC
        ) as rn
    FROM orders
)
DELETE FROM orders
WHERE order_id IN (
    SELECT order_id FROM duplicates WHERE rn > 1
);

이상값 플래그

sql
-- 삭제 대신 플래그로 표시
ALTER TABLE orders ADD COLUMN data_quality_flag VARCHAR(50);

UPDATE orders
SET data_quality_flag = 'INVALID_AMOUNT'
WHERE amount < 0 OR amount > 100000000;

UPDATE orders
SET data_quality_flag = 'FUTURE_DATE'
WHERE created_at > CURRENT_TIMESTAMP;

결론

체크 유형주요 확인 사항SQL 기법
완전성NULL, 빈 문자열COUNT + CASE WHEN NULL
유일성PK/UK 중복GROUP BY + HAVING
유효성범위, 형식WHERE + 정규식
일관성FK, 계산값, 상태JOIN + 비교
적시성최신 데이터MAX(timestamp)

데이터 품질 관리 핵심:

  • 정기적 모니터링으로 문제 조기 발견
  • 자동화된 규칙으로 일관된 검증
  • 품질 점수 대시보드로 현황 파악
  • 이슈 수정 전 반드시 백업

References

  1. DAMA - Data Quality Dimensions
  2. Great Expectations - Data Validation Framework
  3. dbt - Data Build Tool Testing