I want to display the paths that are allowed by a site's robots.txt, for example: www.google/search
That path can be found in Google's robots.txt file. But some websites disallow every path and only have a single "Allow: /" rule at the end, for example: www.chegg.com
I want the code to distinguish between a blanket "Allow: /" and a specific rule like "Allow: /search".
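To make the goal concrete, here is a minimal sketch of the behaviour I have in mind (the helper name collect_allow_paths and the Google URL are only illustrative, not part of my existing code): gather every Allow path and then treat a bare "/" differently from a specific path such as "/search".

# Minimal sketch of the goal; collect_allow_paths is an illustrative name.
import urllib.request

def collect_allow_paths(robots_url):
    # Download robots.txt and return every path that appears after "Allow:"
    with urllib.request.urlopen(robots_url) as response:
        text = response.read().decode("utf-8", errors="replace")
    allow_paths = []
    for line in text.splitlines():
        line = line.strip()
        if line.lower().startswith("allow:"):
            allow_paths.append(line.split(":", 1)[1].strip())
    return allow_paths

paths = collect_allow_paths("https://www.google.com/robots.txt")
specific = [p for p in paths if p not in ("", "/")]
if specific:
    print("Specific Allow rules:", specific)
else:
    print("Only a blanket Allow: / (or no Allow rules) found")

My current attempt is below.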
import urllib.request
import urllib.error
import io
import validators

def check_robots_txt(url):
    # Add the scheme if the user typed a bare domain
    if url.startswith("https://"):
        url = url
    else:
        url = "https://" + url
    validation = validators.url(url)
    if validation:
        # Normalise the URL so "robots.txt" can be appended directly
        if url.endswith("/"):
            path = url
        else:
            path = url + "/"
        try:
            req = urllib.request.urlopen(path + "robots.txt")
            data = io.TextIOWrapper(req, encoding="utf-8")
            # Print every Allow rule, stripping the "Allow: " prefix
            for statement in data.read().splitlines():
                if statement.startswith("Allow"):
                    #print(statement)
                    print(url + statement[7:])
            # Check the Allow rules to decide whether only a blanket "Allow: /" exists
            lines = data.read().splitlines()
            **allow_found = False
            for line in lines:
                if line.strip().startswith("Allow"):
                    if line.strip() == "Allow:":
                        allow_found = False
                        break
                    elif line.strip() == "Allow: /":
                        allow_found = True
            if allow_found:
                print("Scanning not possible")
            else:
                print("Scanning...")**
        except urllib.error.HTTPError as e:
            print(f"HTTP Error {e.code}: {e.reason}")
        except Exception as e:
            print(f"An error occurred: {str(e)}")
    else:
        print("URL is invalid")

url = input("Enter a URL: ")
check_robots_txt(url)
That is my full code. Please check the part that has been marked with the asterisks.
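For reference, the standard library's urllib.robotparser can answer whether a single path is fetchable, but as far as I know it does not expose the list of Allow rules, which is why I am parsing the file myself. A minimal sketch of that check (the Chegg URL is only an example):

from urllib.robotparser import RobotFileParser

rp = RobotFileParser()
rp.set_url("https://www.chegg.com/robots.txt")
rp.read()
# True if the default user agent "*" may fetch /search on this site
print(rp.can_fetch("*", "https://www.chegg.com/search"))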