Scanning a robots.txt file and detecting the paths that are allowed to be scanned


I want to display the paths that are allowed by the robots.txt file, for example: www.google.com/search

Now, the above path can be found in the robots.txt file. But some websites disallow every specific path and just have a single "Allow: /" rule at the end, for example: www.chegg.com
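
For illustration, the two shapes described above might look roughly like this (simplified examples written for this question, not the sites' actual files):

# Shape 1: specific paths disallowed or allowed individually
User-agent: *
Disallow: /
Allow: /search

# Shape 2: a single blanket rule that allows everything
User-agent: *
Allow: /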

I want the code to distinguish between "Allow: /" and "Allow: /search".
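
For reference, one way to draw that distinction for a single directive line might be this (just a sketch; the sample line and variable names are my own). The full attempt is below.

line = "Allow: /search"                  # one directive line from robots.txt
rule = line.split(":", 1)[1].strip()     # -> "/search"
if rule == "/":
    print("blanket allow for the whole site")
else:
    print("specific allow:", rule)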

import urllib.request
import urllib.error
import io
import validators

def check_robots_txt(url):
    # Make sure the URL has a scheme before validating it
    if not url.startswith("https://"):
        url = "https://" + url
    validation = validators.url(url)

    if validation:
        # Ensure a trailing slash so "robots.txt" can be appended
        if url.endswith("/"):
            path = url
        else:
            path = url + "/"
        try:
            req = urllib.request.urlopen(path + "robots.txt")
            data = io.TextIOWrapper(req, encoding="utf-8")
            # Read the response once; a second read() would return an empty string
            lines = data.read().splitlines()

            for statement in lines:
                if statement.startswith("Allow"):
                    # "Allow: " is 7 characters, so statement[7:] is the allowed path
                    print(url + statement[7:])

            # --- the part I need help with starts here ---
            allow_found = False
            for line in lines:
                if line.strip().startswith("Allow"):
                    if line.strip() == "Allow:":
                        allow_found = False
                        break
                    elif line.strip() == "Allow: /":
                        allow_found = True

            if allow_found:
                print("Scanning not possible")
            else:
                print("Scanning...")
            # --- the part I need help with ends here ---

        except urllib.error.HTTPError as e:
            print(f"HTTP Error {e.code}: {e.reason}")
        except Exception as e:
            print(f"An error occurred: {str(e)}")

    else:
        print("URL is invalid")

url = input("Enter a URL: ")
check_robots_txt(url)

This is the code; please check the part between the "starts here" and "ends here" comments.
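
As a side note, Python's standard library also ships urllib.robotparser, which can answer "is this path allowed?" directly instead of parsing the file by hand; a minimal sketch (the user agent "*" and the Google URLs are just placeholders):

import urllib.robotparser

rp = urllib.robotparser.RobotFileParser()
rp.set_url("https://www.google.com/robots.txt")
rp.read()

# can_fetch() applies the Allow/Disallow rules for the given user agent
print(rp.can_fetch("*", "https://www.google.com/search"))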
