from pathlib import Path
from ibis import Table
from . import io
[docs]
class Bipartite:
"""
Lazy Bipartite storage
- sort on role_src, role_dst
"""
edges: Table
role_src: str
role_dst: str
relationtype: str
[docs]
def __init__(
self,
edges: Table,
role_src: str = "src",
role_dst: str = "dst",
relationtype: str = "relationtype",
):
"""
Initialize a bipartite graph with the given edges table and role labels.
Args:
- edges: table containing the bipartite edges.
- role_src: column name for the source role.
- role_dst: column name for the destination role.
- relationtype: column name for the relation type.
"""
self.edges = edges
self.role_src = role_src
self.role_dst = role_dst
self.relationtype = relationtype
[docs]
def project_to_src(self) -> Table:
"""
Project the bipartite graph onto the source role, producing a unipartite edge table.
Two source nodes are connected if they share a common destination node.
Returns:
- Table with columns ``src``, ``dst``, and ``relationtype``.
"""
E = self.edges
# TODO this is a explicit choice: relationtypes could be more complex, ie when two different role_src have
# an other relation with the same role_dst. Simplifying that right now
E_src = E.select(
src=self.role_src, p=self.role_dst, relationtype=self.relationtype
)
E_dst = E.select(dst=self.role_src, p=self.role_dst)
E = E_src.inner_join(E_dst, E_src.p == E_dst.p)
E = E.filter(E.src != E.dst)
E = E.select(["src", "dst", "relationtype"])
return E
[docs]
def project_to_dst(self) -> Table:
"""
Project the bipartite graph onto the destination role, producing a unipartite edge table.
Two destination nodes are connected if they share a common source node.
Returns:
- Table with columns ``src``, ``dst``, and ``relationtype``.
"""
E = self.edges
# TODO this is a explicit choice: relationtypes could be more complex, ie when two different role_src have
# an other relation with the same role_dst. Simplifying that right now
# should we sort on role_src and role_dst or the other way around? For projection it does not matter, but for storage it does. We sort on role_src and role_dst for efficient projection, but that means that the projection to dst is less efficient. Maybe we should sort on role_dst and role_src instead?
E_src = E.select(
src=self.role_dst, p=self.role_src, relationtype=self.relationtype
)
E_dst = E.select(dst=self.role_dst, p=self.role_src)
E = E_src.inner_join(E_dst, E_src.p == E_dst.p)
E = E.filter(E.src != E.dst)
E = E.select(["src", "dst", "relationtype"])
return E
[docs]
def save(self, dir: Path | str) -> None:
"""
Save the bipartite graph to disk.
Edges are saved as a Parquet file and metadata (``role_src``, ``role_dst``,
``relationtype``) as a JSON file. The ``edges`` property is updated to point
at the saved file.
Args:
- dir: path to the directory where the BiPartite graph will be saved.
"""
io.save_bipartite(
edges=self.edges,
role_src=self.role_src,
role_dst=self.role_dst,
relationtype=self.relationtype,
dir=dir,
)
bp = io.read_bipartite(dir=dir)
self.edges = bp.edges