-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathretrace.py
133 lines (107 loc) · 6.91 KB
/
retrace.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import argparse, requests, re
from urllib.parse import urlsplit, urljoin, quote
# Regex for isolating meta refresh redirects. Groups 1 and 2 only make sure the
# attribute quotes match; group 3 is the delay and group 4 the (optional) target.
# Compiled case-insensitively because HTML tag/attribute names and the "url="
# prefix are commonly written in upper or mixed case.
_META_REFRESH_RE = re.compile(
    r'<meta\s+http-equiv=(["\'])refresh\1\s*content=(["\'])([\d\.]+)[;,]?(?:\s*(?:url=)?(?!\2)(?:["\'])?\s*(?:http:(?!\/\/))?([^"\']+)(?:["\'])?\2)?',
    re.IGNORECASE,
)


def _parse_refresh_delay(raw_delay):
    """Convert the delay text captured from a meta refresh tag to a float.

    Only the first two dot-separated parts are kept, so "1.1.1" becomes 1.1.
    Text that still is not a number (for example a lone ".") yields 0.0
    instead of raising.
    """
    truncated = ".".join(raw_delay.split(".")[:2])
    try:
        return float(truncated)
    except ValueError:
        return 0.0


def find_shortest_meta_refresh(location, source_code):
    """Find the meta refresh redirect with the shortest delay in a page.

    Args:
        location: URL of the page that ``source_code`` was fetched from. It is
            used to resolve relative and protocol-relative targets.
        source_code: HTML source of the page.

    Returns:
        A ``(delay, url)`` tuple, where ``delay`` is the delay in seconds as a
        float and ``url`` is the absolute URL being redirected to. Returns
        ``(None, None)`` when the page has no meta refresh tag, or when the
        tag with the shortest delay has no target (a reload, not a redirect).
    """
    matches = _META_REFRESH_RE.findall(source_code)
    if not matches:
        return None, None

    # Pick the shortest delay; "<=" makes the last tag in the document win a tie
    shortest_delay = None
    shortest_url = ""
    for groups in matches:
        delay = _parse_refresh_delay(groups[2])
        if shortest_delay is None or delay <= shortest_delay:
            shortest_delay = delay
            shortest_url = groups[3]

    # Surrounding whitespace is not part of the target
    shortest_url = shortest_url.strip()
    parsed = urlsplit(shortest_url)
    # "%" is kept as-is so targets that are already percent-encoded are not
    # encoded a second time ("/a%20b" must not become "/a%2520b")
    encoded_path = quote(parsed.path, safe="/%")
    query_suffix = f"?{parsed.query}" if parsed.query else ""

    if (
        not parsed.scheme
        and not parsed.netloc
        and not parsed.query
        and parsed.path.lower() in ("", "url=")
    ):
        # The captured target is empty (it is a refresh/reload, not a redirect)
        return None, None

    if shortest_url.startswith("//") and not shortest_url.startswith("///"):
        # "//google.com" is protocol-relative: reuse the scheme of the page
        # doing the redirect. "/google.com", "///google.com", etc. are relative.
        page_scheme = urlsplit(location).scheme
        return shortest_delay, f"{page_scheme}://{parsed.netloc}{encoded_path}{query_suffix}"

    if not parsed.scheme:
        # Relative redirect: resolve it against the current page
        relative_target = f"{parsed.netloc}{encoded_path}{query_suffix}"
        return shortest_delay, urljoin(location, relative_target)

    # Absolute redirect. For "https:///host/path" urlsplit reports an empty
    # netloc and puts the host in the path, so the extra leading slashes are
    # dropped only in that case; otherwise the path keeps its leading "/".
    absolute_path = encoded_path if parsed.netloc else encoded_path.lstrip("/")
    return shortest_delay, f"{parsed.scheme}://{parsed.netloc}{absolute_path}{query_suffix}"
# User agent sent when --agent is not given
DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
# Seconds to wait for a server before giving up (requests has no timeout by default)
DEFAULT_TIMEOUT_SECONDS = 30.0
# Upper bound on followed redirects so a redirect cycle cannot loop forever
DEFAULT_MAX_REDIRECTS = 30
# Status codes for which a Location header is followed
REDIRECT_STATUS_CODES = (201, 301, 302, 303, 307, 308)
# (record key, column title) for each table column, in display order
TABLE_COLUMNS = (
    ("location", "URL"),
    ("type", "Type"),
    ("status_code", "Status Code"),
    ("delay", "Delay"),
)


def _trace_redirects(start_url, user_agent=None, timeout=DEFAULT_TIMEOUT_SECONDS, max_redirects=DEFAULT_MAX_REDIRECTS):
    """Follow header and meta refresh redirects starting at ``start_url``.

    Args:
        start_url: First URL to request.
        user_agent: User-Agent header value; ``None`` uses DEFAULT_USER_AGENT.
        timeout: Seconds to wait for each request.
        max_redirects: Maximum number of requests made before giving up.

    Returns:
        A list with one dict per visited URL, each holding the string keys
        "location", "type", "status_code" and "delay".
    """
    request_headers = {"User-Agent": DEFAULT_USER_AGENT if user_agent is None else user_agent}
    redirects = []
    location = start_url
    # While there is still a redirect to be processed keep processing
    while location:
        if len(redirects) >= max_redirects:
            # Most likely a redirect cycle: record where tracing stopped and give up
            redirects.append({"location": location, "type": "", "status_code": "Too many redirects", "delay": ""})
            break
        # Send a request to the location without letting requests follow redirects
        try:
            response = requests.get(location, headers=request_headers, allow_redirects=False, timeout=timeout)
        except Exception as e:
            # Deliberately broad: any failure ends the trace with an error row
            failure = "Failed to resolve" if "Failed to resolve" in str(e) else "Requests error"
            redirects.append({"location": location, "type": "", "status_code": failure, "delay": ""})
            break
        # Record location and status for this hop
        redirect = {
            "location": location,
            "status_code": f"{response.status_code} {response.reason}",
        }
        if "location" in response.headers and response.status_code in REDIRECT_STATUS_CODES:
            # Redirect is done through response headers (location header)
            redirect["type"] = "Header"
            redirect["delay"] = "0.0s"
            target = response.headers["location"]
            if urlsplit(target).scheme == "":
                # If the redirect is relative then calculate the full URL
                location = urljoin(location, target)
            else:
                # Otherwise use the URL being redirected to as-is
                location = target
        else:
            # Search for meta refresh tags and find the one with the shortest delay
            shortest_delay, shortest_location = find_shortest_meta_refresh(location, response.text)
            if shortest_delay is not None:
                redirect["type"] = "Meta Refresh"
                redirect["delay"] = str(shortest_delay) + "s"
                location = shortest_location
            else:
                # Otherwise there are no more redirects
                redirect["type"] = ""
                redirect["delay"] = ""
                location = None
        redirects.append(redirect)
    return redirects


def _print_banner():
    """Print the RETRACE ASCII-art banner and author line."""
    print()
    print(r" ___ ___ _____ ___    _   ___ ___ ")
    print(r"| _ \ __|_   _| _ \  /_\ / __| __|")
    print(r"|   / _|  | | |   / / _ \ (__| _| ")
    print(r"|_|_\___| |_| |_|_\/_/ \_\___|___|")
    print("\nby barleybobs\n")


def _print_table(redirects):
    """Print the redirect records as a box-drawn table with a bold header row."""
    # Each column is as wide as its longest cell, and never narrower than its title
    widths = {
        key: max([len(title)] + [len(str(row[key])) for row in redirects])
        for key, title in TABLE_COLUMNS
    }

    def rule(left, middle, right):
        # Horizontal border: one dash per character plus one for each padding space
        return left + middle.join("─" * (widths[key] + 2) for key, _ in TABLE_COLUMNS) + right

    header_cells = " │ ".join(title.ljust(widths[key]) for key, title in TABLE_COLUMNS)
    print(rule("╭", "┬", "╮"))
    print("\x1b[1m│ " + header_cells + " │\x1b[0m")
    print(rule("├", "┼", "┤"))
    for row in redirects:
        cells = " │ ".join(str(row[key]).ljust(widths[key]) for key, _ in TABLE_COLUMNS)
        print("│ " + cells + " │")
    print(rule("╰", "┴", "╯"))


def main():
    """Parse the CLI arguments, trace the redirects and display them."""
    # Take in CLI arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--url", "-u", help="URL to be analysed. Should include https:// or http://.", required=True)
    parser.add_argument("--agent", "-a", help="User agent to be used.")
    parser.add_argument("--timeout", "-t", type=float, default=DEFAULT_TIMEOUT_SECONDS, help="Seconds to wait for each request.")
    parser.add_argument("--max-redirects", "-m", type=int, default=DEFAULT_MAX_REDIRECTS, help="Maximum number of redirects to follow.")
    args = parser.parse_args()

    redirects = _trace_redirects(args.url, args.agent, args.timeout, args.max_redirects)

    _print_banner()
    # Display table of redirects
    _print_table(redirects)
    print()


if __name__ == "__main__":
    main()