Get page id of a confluence page from its public url

4.8k Views Asked by At

How can I get a Confluence page_id given a page_url? For example:

If this is the Display URL: https://confluence.som.yale.edu/display/SC/Finding+the+Page+ID+of+a+Confluence+Page

I want to get its page_id using the Confluence REST API.

More details here

2

There are 2 best solutions below

3
On BEST ANSWER

Do you use atlassian-python-api?

In that case you can parse your url to get the confluence space (SC) and page title (Finding the Page ID of a Confluence Page) then use confluence.get_page_id(space, title).

from atlassian import Confluence

# Display URL of the page whose id we want to look up.
page_url = "https://confluence.som.yale.edu/display/SC/Finding+the+Page+ID+of+a+Confluence+Page"

# `user` and `pwd` are expected to already hold your Confluence credentials.
confluence = Confluence(url='https://confluence.som.yale.edu/', username=user, password=pwd)

# The last two "/"-separated pieces of a display URL are the space key and
# the page title; in the title, words are joined with "+".
space, title = page_url.rsplit("/", 2)[1:]
title = " ".join(title.split("+"))

# Ask the client to resolve (space, title) to the page id.
page_id = confluence.get_page_id(space, title)

Note that when your title contains a special character (+ or ü, ä...) your page url will already contain the id like this: https://confluence.som.yale.edu/pages/viewpage.action?pageId=1234567890 so you might want to check for it first.

EDIT: here is a version of what your function could look like:

from atlassian import Confluence
import re
import urllib

# Regex matching a `pageId` query parameter when the id is already in the URL.
# It accepts the parameter in any position of the query string ("?" or "&").
page_id_in_url_pattern = re.compile(r"[?&]pageId=(\d+)")


def get_page_id_from_url(confluence, url):
    """Return the Confluence page id for a page URL.

    Two URL styles are handled:

    * URLs that already carry the id as a ``pageId`` query parameter
      (``.../pages/viewpage.action?pageId=123``); the id is returned as a
      string without calling the client.
    * Display URLs whose last two path segments are the space key and the
      page title (``.../display/<space>/<title>``); the id is obtained from
      ``confluence.get_page_id(space, title)``.

    Args:
        confluence: Client object exposing ``get_page_id(space, title)``.
        url: The page URL.

    Returns:
        The page id: the digits found in the URL as a ``str``, or whatever
        ``confluence.get_page_id`` returns.

    Raises:
        ValueError: If the URL path has fewer than two segments, so no
            space key and title can be extracted.
    """
    # Imported locally: a bare `import urllib` does not guarantee that the
    # `urllib.parse` submodule has been loaded.
    from urllib.parse import unquote, unquote_plus, urlsplit

    # Search the unquoted URL so an id inside a percent-encoded redirect
    # target (e.g. "...%3FpageId%3D123") is found as well.
    id_match = page_id_in_url_pattern.search(unquote(url))
    if id_match:
        return id_match.group(1)

    # Split on "/" BEFORE decoding, and only look at the path, so that an
    # encoded "/" in the title, a query string or a fragment cannot leak
    # into the space key or title.
    segments = [segment for segment in urlsplit(url).path.split("/") if segment]
    if len(segments) < 2:
        raise ValueError(f"Cannot extract a space key and page title from URL: {url}")
    space, title = segments[-2:]

    # unquote_plus turns "+" into a space first and only then decodes
    # percent escapes, so an encoded literal plus ("%2B") stays a "+".
    return confluence.get_page_id(unquote(space), unquote_plus(title))



if __name__ == "__main__":
    # Manual demo: ask for credentials on the terminal, then print the page
    # id resolved for a sample display URL.
    from getpass import getpass

    page_url = "https://confluence.som.yale.edu/display/SC/Finding+the+Page+ID+of+a+Confluence+Page"

    # The password prompt does not echo what is typed.
    user = input('Login: ')
    pwd = getpass('Password: ')

    confluence = Confluence(url='https://confluence.som.yale.edu/', username=user, password=pwd)

    print(get_page_id_from_url(confluence, page_url))
0
On

Unfortunately the Atlassian Python client is very limited (e.g. there is no way to access tiny URLs from page share links). If you get an API key you could hypothetically make some REST calls to download the headers of any arbitrary page and grab the pageId from that.

If you are using the Python client, however, you can only get the pageId from a URL if 1) it is already in the URL or 2) the URL contains a properly formatted spaceKey and title.

Building on the function that Tranbi provided, here is an improved function that attempts to extract the pageId from a URL. It does not work for all URLs though, be warned.

from atlassian import Confluence
import re
import urllib


# The only host whose URLs get_pageid_from_url() accepts.
CONFLUENCE_HOSTNAME = 'confluence.som.yale.edu'

# Query-string patterns. Each one is anchored to the start of the query or to
# a "?" / "&" separator so that it only matches the whole parameter name
# (e.g. "title=" but not "subtitle="). "?" is accepted as a separator because
# an unquoted login-redirect query embeds a second URL with its own "?".
PAGEID_RE = re.compile(r"(?:^|[?&])pageId=(\d+)")
# Space keys of personal spaces ("~user.name") may contain punctuation, so the
# value runs up to the next delimiter instead of being limited to [a-zA-Z0-9~].
SPACEKEY_RE = re.compile(r"(?:^|[?&])spaceKey=([^#&=]+)")
TITLE_RE = re.compile(r"(?:^|[?&])title=([^#&=]+)")


def get_pageid_from_url(client, raw_url):
    """Extract the Confluence page id referenced by ``raw_url``.

    Resolution order:

    1. A ``pageId`` query parameter is returned directly (as a string).
    2. A ``/display/<spaceKey>/<title>`` path is resolved through
       ``client.get_page_id(space, title)``.
    3. ``spaceKey`` and ``title`` query parameters are resolved through
       ``client.get_page_id(space, title)``.

    For ``/login.action`` URLs the whole URL is percent-decoded once before
    the query is inspected, so parameters of the encoded redirect target
    become visible; for these URLs the title is also cut at the first
    ``" - "``.

    Args:
        client: Object exposing ``get_page_id(space, title)``.
        raw_url: The URL to analyse.

    Returns:
        The page id: the digits found in the URL as a ``str``, or whatever
        ``client.get_page_id`` returns.

    Raises:
        ValueError: If the URL's netloc is not ``CONFLUENCE_HOSTNAME``, if a
            ``/display/`` path does not have exactly a space key and a title,
            or if none of the supported URL styles matches. (The malformed
            ``/display/`` case previously relied on ``assert``, which is
            skipped when Python runs with ``-O``.)
    """
    # Imported locally: a bare `import urllib` does not guarantee that the
    # `urllib.parse` submodule has been loaded.
    import urllib.parse

    parsed = urllib.parse.urlparse(raw_url)
    netloc, path, query = parsed.netloc, parsed.path, parsed.query
    if netloc != CONFLUENCE_HOSTNAME:
        raise ValueError(f"Only Confluence URLs are supported in this script. You supplied a URL with netloc={netloc}")

    # Special handling for login redirect URLs: the real target is
    # percent-encoded inside the query, so decode the URL once and re-read
    # the query from the decoded form.
    fix_title = path == '/login.action'
    if fix_title:
        pretty_url = urllib.parse.unquote(raw_url)
        query = urllib.parse.urlparse(pretty_url).query

    # Get the pageId directly from the URL if available
    pageid_match = PAGEID_RE.search(query)
    if pageid_match:
        return pageid_match.group(1)

    # Otherwise, get the spaceKey and title from the URL, and then make a
    # separate call to the API to get the pageId
    if path.startswith('/display/'):
        # A trailing slash would otherwise add an empty fifth piece.
        path_pieces = path.rstrip('/').split('/')
        if len(path_pieces) != 4:
            raise ValueError(f"Expected a path of the form /display/<spaceKey>/<title>, but found {path}")
        _, _, space, title = path_pieces
        space = urllib.parse.unquote(space, encoding='utf-8', errors='replace')
        # Fix title; order of operations matters: "+" must become a space
        # before percent-decoding, or an encoded "%2B" would be lost.
        title = title.replace("+", " ")
        title = urllib.parse.unquote(title, encoding='utf-8', errors='replace')
        return client.get_page_id(space, title)

    spacekey_match = SPACEKEY_RE.search(query)
    title_match = TITLE_RE.search(query)
    if spacekey_match and title_match:
        space = urllib.parse.unquote(spacekey_match.group(1), encoding='utf-8', errors='replace')
        # Fix title; order of operations matters (see above). The " - " cut
        # happens before percent-decoding, as in the original implementation.
        title = title_match.group(1)
        title = title.replace("+", " ")
        if fix_title:
            title = title.split(' - ')[0]
        title = urllib.parse.unquote(title, encoding='utf-8', errors='replace')
        return client.get_page_id(space, title)

    # Unfortunately this URL style is not supported by the Python Atlassian client :(
    raise ValueError(f"Cannot parse (pageId) or (spaceKey and title) from URL: {raw_url}")