-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathBrainteaser Exercises - Final.sql
172 lines (154 loc) · 3.74 KB
/
Brainteaser Exercises - Final.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
/*
Brainteaser Exercises - Final
A set of brainteaser exercise to consolidate SQL lessons
*/
/*
This exercise analyses the data quality of BadMessage table, that contains list of messages.
*/
SELECT
bm.MessageId
, bm.ReceivedDate
, bm.Region
, bm.Category
, bm.Movement
FROM
BadMessage bm
ORDER BY
bm.MessageId;
/*
We have been told that the MessageId column has unique and contiguous values
i.e. there are no dupicates and no missing values (so no gaps in the sequence of MessageId values).
There are three tasks:
1. Check for duplicates in the MessageId column
2. Differentiate between identical and conflicting duplicates
3. Check for missing MessageId values
*/
/*
Task 1: . Let's check for duplicates in the MessageId column. If there are duplicates,
(1) list the duplicate MessageId values and the number of times they are duplicated
(2) list (all columns of) the duplicated rows
Consider two approaches; one that uses Window functions and one that does not.
*/
--
-- answer
-- (1) List the duplicate MessageId values and the number of times they are duplicated
-- Group By Message Id, counting the rows and filter when the count is greater than 1
SELECT
bm.MessageId
, COUNT(*) AS MessageCountById
FROM
BadMessage bm
GROUP BY
bm.MessageId
HAVING
COUNT(*) > 1;
-- (2) list (all columns of) the duplicated rows
SELECT * FROM BadMessage bm
WHERE MessageId IN (
SELECT
bm.MessageId
FROM
BadMessage bm
GROUP BY
bm.MessageId
HAVING
COUNT(*) > 1)
ORDER BY bm.MessageId ;
-- Window functions approach
WITH cte AS (
SELECT
bm.MessageId
-- , bm.ReceivedDate
-- , bm.Region
-- , bm.Category
-- , bm.Movement
, COUNT(*) OVER (PARTITION BY bm.MessageId) AS MessageCount
FROM
BadMessage bm
)
SELECT * FROM cte
WHERE cte.MessageCount > 1
ORDER BY
cte.MessageId;
-- An alternative Window Functions approach
WITH cte (MessageId, MessageCount) AS (
SELECT
bm.MessageId
, COUNT(*) OVER (PARTITION BY bm.MessageId)
FROM
BadMessage bm
)
,
cte2 (MessageId) AS (
SELECT
DISTINCT cte.MessageId
FROM
cte
WHERE
cte.MessageCount > 1
)
SELECT
bm.*
FROM
BadMessage bm
INNER JOIN cte2 ON
bm.MessageId = cte2.MessageId
ORDER BY
bm.MessageId
/*
Task 2: differentiate between identical and conflicting duplicates
* identical duplicates have the same values in all columns
* conflicting duplicates have different values in at least one column
*/
SELECT
bm.MessageId
, bm.ReceivedDate
, bm.Region
, bm.Category
, bm.Movement
, COUNT(*) OVER (PARTITION BY bm.MessageId) AS MessageCount
, COUNT(*) OVER (PARTITION BY bm.MessageId, bm.ReceivedDate, bm.Region, bm.Category, bm.Movement) AS IdenticalRowCount
, CASE WHEN COUNT(*) OVER (PARTITION BY bm.MessageId) =
COUNT(*) OVER (PARTITION BY bm.MessageId, bm.ReceivedDate, bm.Region, bm.Category, bm.Movement)
THEN 'Identical'
ELSE 'Conflicting' END AS DuplicateType
FROM
BadMessage bm
WHERE
bm.MessageId IN (
SELECT
MessageId
FROM
BadMessage
GROUP BY
MessageId
HAVING
COUNT(*) > 1
)
/*
Task 3. Check for missing MessageId values
Hint: the database has a tally (sequence) table named Tally that contains a sequence of numbers from 1 to 100,000
*/
SELECT N as MissingMessageId
from Tally
WHERE N NOT IN (SELECT MessageId
FROM BadMessage)
AND N <= (SELECT MAX(MessageId)
FROM BadMessage)
/*
Another approach is order the rows by MessageId and to calculate the difference between the MessageId of one row and the next
*/
DROP TABLE IF EXISTS #temp
SELECT
bm.MessageId
, bm.ReceivedDate
, bm.Region
, bm.Category
, bm.Movement
, LAG(bm.MessageId) OVER (ORDER BY bm.MessageId) LagMessageId
, bm.MessageId - LAG(bm.MessageId) OVER (ORDER BY bm.MessageId) AS StepMessageId
INTO #temp
FROM
BadMessage bm;
select MessageId - 1 AS MissingMessageId from #temp
WHERE StepMessageId = 2;