Source code for muxpack.bipartite

from pathlib import Path
from ibis import Table
from . import io



[docs]
class Bipartite:
    """
    Lazy bipartite graph storage.

    Holds an edge table together with the names of the columns that carry the
    source role, the destination role, and the relation type of each edge.

    - sort on role_src, role_dst
      (NOTE(review): the ordering is not enforced by the attribute
      declarations below -- confirm where, and whether, it is applied.)
    """

    # Table containing the bipartite edges.
    edges: Table
    # Name of the column in ``edges`` that holds the source role.
    role_src: str
    # Name of the column in ``edges`` that holds the destination role.
    role_dst: str
    # Name of the column in ``edges`` that holds the relation type.
    relationtype: str


[docs]
    def __init__(
        self,
        edges: Table,
        role_src: str = "src",
        role_dst: str = "dst",
        relationtype: str = "relationtype",
    ):
        """
        Create a bipartite graph from an edge table and its column names.

        The arguments are stored on the instance as given; nothing is
        validated or copied here.

        Args:
            - edges: table containing the bipartite edges.
            - role_src: name of the column holding the source role.
            - role_dst: name of the column holding the destination role.
            - relationtype: name of the column holding the relation type.
        """
        # Column-name settings first, then the table they describe.
        self.role_src, self.role_dst = role_src, role_dst
        self.relationtype = relationtype
        self.edges = edges



[docs]
    def project_to_src(self) -> Table:
        """
        Build the unipartite projection of the graph onto the source role.

        The edge table is joined with itself on the destination column, so
        each ordered pair of distinct source nodes that share a destination
        node produces a row. No deduplication is applied here.

        Returns:
            - Table with columns ``src``, ``dst``, and ``relationtype``.
        """
        edge_table = self.edges

        # The destination role acts as the shared pivot node ``p`` on both
        # sides of the self-join.
        # TODO: this is an explicit simplification -- only the relation type
        # of the left-hand edge is kept. Two source nodes may have different
        # relation types to the same destination node; the right-hand one is
        # dropped.
        left = edge_table.select(
            src=self.role_src, p=self.role_dst, relationtype=self.relationtype
        )
        right = edge_table.select(dst=self.role_src, p=self.role_dst)

        paired = left.inner_join(right, left.p == right.p)
        # Remove rows that pair a node with itself.
        without_self_pairs = paired.filter(paired.src != paired.dst)
        return without_self_pairs.select(["src", "dst", "relationtype"])



[docs]
    def project_to_dst(self) -> Table:
        """
        Build the unipartite projection of the graph onto the destination role.

        The edge table is joined with itself on the source column, so each
        ordered pair of distinct destination nodes that share a source node
        produces a row. No deduplication is applied here.

        Returns:
            - Table with columns ``src``, ``dst``, and ``relationtype``.
        """
        edge_table = self.edges

        # The source role acts as the shared pivot node ``p`` on both sides
        # of the self-join.
        # TODO: this is an explicit simplification -- only the relation type
        # of the left-hand edge is kept. Two destination nodes may have
        # different relation types to the same source node; the right-hand
        # one is dropped.
        # TODO: open question on storage order. The result of a projection
        # does not depend on it, but storage is meant to be sorted on
        # (role_src, role_dst) for efficient projection, which may make this
        # projection onto the destination role the less efficient one.
        # Consider whether sorting on (role_dst, role_src) would be better.
        left = edge_table.select(
            src=self.role_dst, p=self.role_src, relationtype=self.relationtype
        )
        right = edge_table.select(dst=self.role_dst, p=self.role_src)

        paired = left.inner_join(right, left.p == right.p)
        # Remove rows that pair a node with itself.
        without_self_pairs = paired.filter(paired.src != paired.dst)
        return without_self_pairs.select(["src", "dst", "relationtype"])



[docs]
    def save(self, dir: Path | str) -> None:
        """
        Write the bipartite graph to disk and re-point ``edges`` at the saved copy.

        Writing is delegated to ``io.save_bipartite``, which receives the edge
        table and the three column-name settings (``role_src``, ``role_dst``,
        ``relationtype``). The on-disk layout is defined by that function
        (documented as a Parquet file for the edges and a JSON file for the
        metadata). Afterwards the graph is read back with
        ``io.read_bipartite`` and ``self.edges`` is replaced by the edges of
        the loaded result.

        Args:
            - dir: path to the directory where the bipartite graph will be saved.
        """
        io.save_bipartite(
            dir=dir,
            edges=self.edges,
            role_src=self.role_src,
            role_dst=self.role_dst,
            relationtype=self.relationtype,
        )
        # Swap the in-memory table for the one backed by the files just written.
        self.edges = io.read_bipartite(dir=dir).edges