How to fully sanitise HTTP

53 Views

I am trying to scrape a specific portion of a website's html to get some text from a "bio" section. When I run the launcher it shuts down upon loading the extension this is happening in and returns this traceback:

Traceback (most recent call last):
  File "{userpath}\AppData\Local\Programs\Python\Python312\Lib\runpy.py", line 198, in _run_module_as_main
    return _run_code(code, main_globals, None,
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "{userpath}\AppData\Local\Programs\Python\Python312\Lib\runpy.py", line 88, in _run_code
    exec(code, run_globals)
  File "{userpath}\.vscode\extensions\ms-python.debugpy-2024.2.0-win32-x64\bundled\libs\debugpy\adapter/../..\debugpy\launcher/../..\debugpy\__main__.py", line 39, in <module>
    cli.main()
  File "{userpath}\.vscode\extensions\ms-python.debugpy-2024.2.0-win32-x64\bundled\libs\debugpy\adapter/../..\debugpy\launcher/../..\debugpy/..\debugpy\server\cli.py", line 430, in main
    run()
  File "{userpath}\.vscode\extensions\ms-python.debugpy-2024.2.0-win32-x64\bundled\libs\debugpy\adapter/../..\debugpy\launcher/../..\debugpy/..\debugpy\server\cli.py", line 284, in run_file
    runpy.run_path(target, run_name="__main__")
  File "{userpath}\.vscode\extensions\ms-python.debugpy-2024.2.0-win32-x64\bundled\libs\debugpy\_vendored\pydevd\_pydevd_bundle\pydevd_runpy.py", line 321, in run_path
    return _run_module_code(code, init_globals, run_name,
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "{userpath}\.vscode\extensions\ms-python.debugpy-2024.2.0-win32-x64\bundled\libs\debugpy\_vendored\pydevd\_pydevd_bundle\pydevd_runpy.py", line 135, in _run_module_code
    _run_code(code, mod_globals, init_globals,
  File "{userpath}\.vscode\extensions\ms-python.debugpy-2024.2.0-win32-x64\bundled\libs\debugpy\_vendored\pydevd\_pydevd_bundle\pydevd_runpy.py", line 124, in _run_code
    exec(code, run_globals)
  File ".\launcher.py", line 8, in <module>
    bot.run(VERSION)
  File ".\libs\bot\__init__.py", line 62, in run
    super().run(self.TOKEN, reconnect=True)
  File ".\Lib\site-packages\discord\client.py", line 860, in run
    asyncio.run(runner())
  File "{userpath}\AppData\Local\Programs\Python\Python312\Lib\asyncio\runners.py", line 194, in run
    return runner.run(main)
           ^^^^^^^^^^^^^^^^
  File "{userpath}\AppData\Local\Programs\Python\Python312\Lib\asyncio\runners.py", line 118, in run
    return self._loop.run_until_complete(task)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "{userpath}\AppData\Local\Programs\Python\Python312\Lib\asyncio\base_events.py", line 685, in run_until_complete
    return future.result()
           ^^^^^^^^^^^^^^^
  File ".\Lib\site-packages\discord\client.py", line 849, in runner        
    await self.start(token, reconnect=reconnect)
  File ".\Lib\site-packages\discord\client.py", line 777, in start
    await self.login(token)
  File ".\Lib\site-packages\discord\client.py", line 612, in login
    data = await self.http.static_login(token)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File ".\Lib\site-packages\discord\http.py", line 803, in static_login    
    data = await self.request(Route('GET', '/users/@me'))
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File ".\Lib\site-packages\discord\http.py", line 625, in request
    async with self.__session.request(method, url, **kwargs) as response:
  File ".\Lib\site-packages\aiohttp\client.py", line 1194, in __aenter__   
    self._resp = await self._coro
                 ^^^^^^^^^^^^^^^^
  File ".\Lib\site-packages\aiohttp\client.py", line 603, in _request      
    resp = await req.send(conn)
           ^^^^^^^^^^^^^^^^^^^^
  File ".\Lib\site-packages\aiohttp\client_reqrep.py", line 713, in send   
    await writer.write_headers(status_line, self.headers)
  File ".\Lib\site-packages\aiohttp\http_writer.py", line 129, in write_headers
    buf = _serialize_headers(status_line, headers)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "aiohttp\\_http_writer.pyx", line 132, in aiohttp._http_writer._serialize_headers
  File "aiohttp\\_http_writer.pyx", line 116, in aiohttp._http_writer._safe_header
ValueError: Newline or carriage return character detected in HTTP status message or header. This is a potential security issue.

I've made an effort to sanitise the returned HTML before it's passed like so

from discord.ext.commands import Cog, command
from bs4 import BeautifulSoup
import aiohttp
import asyncio
import bleach

class Scraper:
    """Fetches a Lodestone character page, sanitises the HTML, and pulls
    out the text of the ``span.character_selfintroduction`` bio section."""

    @staticmethod
    async def fetch(url):
        """Download *url* and return the response body as text."""
        async with aiohttp.ClientSession() as http:
            async with http.get(url) as resp:
                return await resp.text()

    @staticmethod
    async def scrape(url):
        """Fetch *url*, sanitise it, and return a list of bio strings."""
        raw_html = await Scraper.fetch(url)
        soup = BeautifulSoup(raw_html, 'html.parser')

### SANITIZATION HERE

        # Whitelist only <span class=...>; everything else is escaped/stripped.
        cleaned_html_content = bleach.clean(str(soup), tags=['span'], attributes={'span': ['class']})
        # Drop newline / carriage-return characters from the markup.
        stripped_html_content = cleaned_html_content.replace('\n', '').replace('\r', '')
        ##TODO: Check for anything else that needs sanitising for security too
        # Re-parse the sanitised markup so we can select elements from it.
        soup = BeautifulSoup(stripped_html_content, 'html.parser')

## SANITZATION ENDS

        # Collect the bio text from every matching <span>.
        matches = soup.select('span.character_selfintroduction')
        return [Scraper.extract_text_from_span(tag) for tag in matches]

    @staticmethod
    def extract_text_from_span(span):
        """Recursively gather stripped text from *span* and any nested spans."""
        pieces = []
        for node in span.children:
            if isinstance(node, str):
                pieces.append(node.strip())
            elif node.name == 'span':
                pieces.append(Scraper.extract_text_from_span(node))
        return ''.join(pieces)

class xivLodestone(Cog):
    """Cog that scrapes character bios ("self introductions") from the
    regional Lodestone character pages and posts them to the channel."""

    def __init__(self, bot):
        self.bot = bot

    @command(name="get_whoami")  ## test command for now, just return the bio
    async def scrape_introductions(self, ctx, lodestoneID: int):
        """Fetch the bio for *lodestoneID* from every regional Lodestone
        mirror and send each extracted intro text to the channel.

        BUG FIX: the original URL strings were plain strings, not f-strings,
        so the literal text "{LodestoneID}" (which also mismatched the
        parameter's spelling) was sent to the server and the character ID
        was never substituted into the URL.
        """
        scraper = Scraper()
        urls = [f'https://eu.finalfantasyxiv.com/lodestone/character/{lodestoneID}',
                f'https://fr.finalfantasyxiv.com/lodestone/character/{lodestoneID}',
                f'https://de.finalfantasyxiv.com/lodestone/character/{lodestoneID}',
                f'https://jp.finalfantasyxiv.com/lodestone/character/{lodestoneID}',
                f'https://na.finalfantasyxiv.com/lodestone/character/{lodestoneID}'
                ]  ## Added all of them as a fallback, might make that a list it iterates instead?
        # Scrape all mirrors concurrently; each result is a list of intro strings.
        intro_texts = await asyncio.gather(*(scraper.scrape(url) for url in urls))
        for intro_text_list in intro_texts:
            for intro_text in intro_text_list:
                await ctx.send(intro_text)

    @Cog.listener()
    async def on_ready(self):
        # Signal cog readiness to the custom launcher's ready tracker.
        if not self.bot.ready:
            self.bot.cogs_ready.ready_up(
                "xivLodestone")

def setup(bot):
    """Entry point used by discord.py to register this cog on *bot*."""
    cog = xivLodestone(bot)
    bot.add_cog(cog)

I expected the code above to whitelist only the HTML elements I needed, while stripping out any characters that pose a security risk.

I'm not sure what else to do in this case.

EDIT: It's been brought to my attention that I'd been focusing on HTML and not HTTP. I'll come back to this later if I can't figure it out but please forgive the newbie error.

EDIT 2: Added the full traceback. I cleaned up the directory paths to remove some personal info, making them relative to the working directory or simply indicating that they live in AppData.

0

There are 0 best solutions below