What is the right way to refer to Bazel data files in Python?

7.4k Views Asked by At

Suppose I have the following BUILD file

# Library target whose Python sources need //bar:data.json available at runtime.
py_library(
  name = "foo",
  # The attribute is `srcs` (plural); `src` is not accepted by py_library.
  srcs = ["foo.py"],
  # Files listed in `data` are made available in the runfiles tree.
  data = ["//bar:data.json"],
)

How should I refer to the data.json in foo.py file? I wanted to have something like below, what should I use for some_path?

with open(os.path.join(some_path, "bar/data.json"), 'r') as fp:
    data = json.load(fp)

I couldn't find much general documentation about *.runfiles online -- any pointer will be appreciated!

3

There are 3 best solutions below

0
On

For the benefit of others who find their way here, the canonical solution is to use the "runfiles" library provided by the rules_python package for Bazel.

In my environment, it looks roughly like this:

from rules_python.python.runfiles import runfiles

# Path to look up; the leading "workspace" segment is presumably the name of
# your Bazel workspace -- confirm for your repository.
resource = "workspace/bar/data.json"
r = runfiles.Create()
# Presumably resolves the runfiles-relative path to a real filesystem path --
# verify against the rules_python runfiles documentation.
path = r.Rlocation(resource)

More documentation can be found here:

0
On

Here is a function that should return the path to the runfiles root for any py_binary in all the cases that I'm aware of:

import os
import re
import sys

def find_runfiles():
    """Find the runfiles tree (useful when _not_ run from a zip file).

    Starting from the absolute form of ``sys.argv[0]``, each candidate path is
    checked in two ways:

    * a sibling directory named ``<candidate>.runfiles`` exists, or
    * the candidate itself lies inside (or is) a ``*.runfiles`` directory.

    If neither holds and the candidate is a symlink, the link is followed
    and the checks are repeated on its target.

    Returns:
        The path of the runfiles directory.

    Raises:
        RuntimeError: if no runfiles directory can be located.
    """
    stub_filename = os.path.abspath(sys.argv[0])
    # ".runfiles" has to end a path component, so names that merely start
    # with it (e.g. "foo.runfiles_manifest") are not mistaken for the tree.
    runfiles_pattern = r"(.*\.runfiles)(?:" + re.escape(os.sep) + r"|$)"

    # Remember visited candidates so a symlink cycle cannot loop forever.
    visited = set()
    while stub_filename not in visited:
        visited.add(stub_filename)

        # Found it next to the stub?
        module_space = stub_filename + '.runfiles'
        if os.path.isdir(module_space):
            return module_space

        # Already running from inside a runfiles tree?
        matchobj = re.match(runfiles_pattern, stub_filename)
        if matchobj:
            return matchobj.group(1)

        # Follow symlinks, looking for my module space.
        if not os.path.islink(stub_filename):
            break
        target = os.readlink(stub_filename)
        # os.path.join keeps an absolute target as is and resolves a
        # relative one against the directory containing the link.
        stub_filename = os.path.normpath(
            os.path.join(os.path.dirname(stub_filename), target))

    raise RuntimeError('Cannot find .runfiles directory for %s' %
                       sys.argv[0])

For the example in your question you could use it like so:

with open(os.path.join(find_runfiles(), "name_of_workspace/bar/data.json"), 'r') as fp:
    data = json.load(fp)

Note that this function won't help if you build zipped executables of your python apps (using subpar, probably); for those you will need some more code. This next snippet includes get_resource_filename() and get_resource_directory(), which will work for both regular py_binary and .par binaries:

import atexit
import os
import re
import shutil
import sys
import tempfile
import zipfile


 def get_resource_filename(path):
    zip_path = get_zip_path(sys.modules.get("__main__").__file__)
    if zip_path:
        tmpdir = tempfile.mkdtemp()
        atexit.register(lambda: shutil.rmtree(tmpdir, ignore_errors=True))
        zf = BetterZipFile(zip_path)
        zf.extract(member=path, path=tmpdir)
        return os.path.join(tmpdir, path)
    elif os.path.exists(path):
        return path
    else:
        path_in_runfiles = os.path.join(find_runfiles(), path)
        if os.path.exists(path_in_runfiles):
            return path_in_runfiles
        else:
            raise ResourceNotFoundError


def get_resource_directory(path):
    """Find or extract an entire subtree and return its location.

    Lookup order:

    1. If the ``__main__`` module's file lives inside a zip archive (as
       reported by ``get_zip_path``), every archive member at or below
       ``path`` is extracted into a fresh temporary directory, which is
       removed at interpreter exit.
    2. ``path`` exactly as given, if it exists.
    3. ``path`` joined onto the runfiles root returned by ``find_runfiles``.

    Args:
        path: Directory path of the resource subtree; an empty string selects
            the whole archive in the zip case.

    Returns:
        A path to the directory on the local filesystem.

    Raises:
        ResourceNotFoundError: if nothing matching ``path`` is found.
        RuntimeError: propagated from ``find_runfiles`` when no runfiles tree
            can be located.
    """
    # __main__ may have no __file__ (e.g. an interactive session); in that
    # case there is no archive to look into.
    main_file = getattr(sys.modules.get("__main__"), "__file__", None)
    zip_path = get_zip_path(main_file) if main_file else None
    if zip_path:
        # Match whole path components only, so "data" does not also pull in
        # "database/...". Zip member names use "/" as the separator.
        stem = path.rstrip("/")
        prefix = stem + "/" if stem else ""
        with BetterZipFile(zip_path) as zf:
            members = [
                name for name in zf.namelist()
                if name == stem or name.startswith(prefix)
            ]
            if not members:
                raise ResourceNotFoundError(path)
            tmpdir = tempfile.mkdtemp()
            atexit.register(shutil.rmtree, tmpdir, ignore_errors=True)
            zf.extractall(members=members, path=tmpdir)
        return os.path.join(tmpdir, path)

    if os.path.exists(path):
        return path

    path_in_runfiles = os.path.join(find_runfiles(), path)
    if os.path.exists(path_in_runfiles):
        return path_in_runfiles
    raise ResourceNotFoundError(path)


def get_zip_path(path):
    """If path is inside a zip file, return the zip file's path.

    ``path`` and then each of its parent directories is tested with
    ``zipfile.is_zipfile``; the first one that is a zip archive is returned.

    Args:
        path: Path that may point at, or into, a zip archive.

    Returns:
        The path of the enclosing zip archive, or ``None`` when neither
        ``path`` nor any of its ancestors is a zip file (including when
        ``path`` is empty or ``None``).
    """
    current = path
    while current:
        if zipfile.is_zipfile(current):
            return current
        parent = os.path.dirname(current)
        # dirname() reaches a fixed point at a filesystem root; stopping
        # there (and on "" via the loop condition) guarantees termination
        # for relative paths and drive roots as well.
        if parent == current:
            return None
        current = parent
    return None


class ResourceNotFoundError(RuntimeError):
    """Raised when a requested resource cannot be located."""

def find_runfiles():
    """Find the runfiles tree (useful when _not_ run from a zip file).

    Starting from the absolute form of ``sys.argv[0]``, each candidate path is
    checked in two ways:

    * a sibling directory named ``<candidate>.runfiles`` exists, or
    * the candidate itself lies inside (or is) a ``*.runfiles`` directory.

    If neither holds and the candidate is a symlink, the link is followed
    and the checks are repeated on its target.

    Returns:
        The path of the runfiles directory.

    Raises:
        RuntimeError: if no runfiles directory can be located.
    """
    stub_filename = os.path.abspath(sys.argv[0])
    # ".runfiles" has to end a path component, so names that merely start
    # with it (e.g. "foo.runfiles_manifest") are not mistaken for the tree.
    runfiles_pattern = r"(.*\.runfiles)(?:" + re.escape(os.sep) + r"|$)"

    # Remember visited candidates so a symlink cycle cannot loop forever.
    visited = set()
    while stub_filename not in visited:
        visited.add(stub_filename)

        # Found it next to the stub?
        module_space = stub_filename + '.runfiles'
        if os.path.isdir(module_space):
            return module_space

        # Already running from inside a runfiles tree?
        matchobj = re.match(runfiles_pattern, stub_filename)
        if matchobj:
            return matchobj.group(1)

        # Follow symlinks, looking for my module space.
        if not os.path.islink(stub_filename):
            break
        target = os.readlink(stub_filename)
        # os.path.join keeps an absolute target as is and resolves a
        # relative one against the directory containing the link.
        stub_filename = os.path.normpath(
            os.path.join(os.path.dirname(stub_filename), target))

    raise RuntimeError('Cannot find .runfiles directory for %s' %
                       sys.argv[0])


class BetterZipFile(zipfile.ZipFile):
    """Shim around ZipFile that preserves permissions on extract.

    After a member is written out, the mode taken from the high 16 bits of
    its ``external_attr`` is applied with ``os.chmod``. Members that carry no
    mode bits there are left with the permissions they were created with,
    rather than being chmod'ed to 0.
    """

    def extract(self, member, path=None, pwd=None):
        """Extract one member and restore its stored permission bits.

        Args:
            member: Member name or ``zipfile.ZipInfo`` object.
            path: Directory to extract into; defaults to the current
                working directory.
            pwd: Password for encrypted members.

        Returns:
            The path of the extracted file or directory.
        """
        info = self._member_info(member)
        target = zipfile.ZipFile.extract(self, info, path, pwd)
        self._restore_mode(target, info)
        return target

    def extractall(self, path=None, members=None, pwd=None):
        """Extract several members (all by default), restoring permissions.

        ``ZipFile.extractall`` does not go through ``extract``, so it is
        overridden here to keep permission handling consistent.
        """
        if members is None:
            members = self.namelist()
        extracted = []
        for member in members:
            info = self._member_info(member)
            extracted.append(
                (zipfile.ZipFile.extract(self, info, path, pwd), info))
        # Apply modes only after everything is written, deepest paths first,
        # so a restrictive directory mode cannot block work on its contents.
        extracted.sort(key=lambda item: len(item[0]), reverse=True)
        for target, info in extracted:
            self._restore_mode(target, info)

    def _member_info(self, member):
        """Return the ZipInfo for ``member``, looking it up by name if needed."""
        if isinstance(member, zipfile.ZipInfo):
            return member
        return self.getinfo(member)

    @staticmethod
    def _restore_mode(target, info):
        """chmod ``target`` to the permission bits stored for ``info``, if any."""
        # Keep only permission bits; file-type bits are not valid for chmod.
        mode = (info.external_attr >> 16) & 0o7777
        if mode:
            os.chmod(target, mode)

Using this second code snippet, your example would look like:

with open(get_resource_filename("name_of_workspace/bar/data.json"), 'r') as fp:
    data = json.load(fp)
0
On

Short answer: os.path.dirname(__file__)

Here is the full example:

$ ls
bar/  BUILD  foo.py  WORKSPACE

$ cat BUILD
py_binary(
    name = "foo",
    srcs = ["foo.py"],
    data = ["//bar:data.json"],
)

$ cat foo.py
import json
import os

# Directory containing this script; the data file is looked up relative to it.
ws = os.path.dirname(__file__)
# Load and print the JSON data file located at bar/data.json under that directory.
with open(os.path.join(ws, "bar/data.json"), 'r') as fp:
  print(json.load(fp))

$ cat bar/BUILD
exports_files(["data.json"])

$ bazel run :foo

Edit: this doesn't work well when your package is in a subdirectory. In that case you may need to walk back up the directory tree by applying os.path.dirname repeatedly.