Source code for timelink.pandas.attribute_values

"""
Create a dataframe with the values of an attribute
"""

import pandas as pd

from sqlalchemy import select, func, and_, desc
from sqlalchemy.orm import Session

from timelink.api.database import TimelinkDatabase
import warnings



[docs]
def attribute_values(
    the_type,
    attr_type=None,
    groupname=None,
    dates_between=None,
    db: TimelinkDatabase | None = None,
    session=None,
    sql_echo=False,
):
    """Return the vocabulary of an attribute

    The returned dataframe has a row for each unique value
    a 'count' with the number of different entities, and
    the the first and last date for that row

    Args:
        the_type = attribute type to search for
        attr_type = alians for the_type, deprecated
        db = database connection to use, either db or session must be specified
        groupname = groupname to filter by (str or list), if None all groups counted
        db = database to use
        session = database session to use, if None will use db.session()
        dates_between = tuple with two dates in format yyyy-mm-dd
        sql_echo = if true will print the sql statement


    To filter by dates: dates_in = (from_date,to_date)
    with dates in format yyyy-mm-dd
    will return attributes with
    from_date < date < to_date

    """
    if the_type is None and attr_type is not None:
        warnings.warn(
            "The 'attr_type' parameter is deprecated. Use 'the_type' instead.",
            DeprecationWarning,
            stacklevel=2
        )
        the_type = attr_type

    if the_type is None:
        raise ValueError("the_type parameter is required")
    #  We try to use an existing connection and table introspection
    # to avoid extra parameters and going to database too much
    dbsystem: TimelinkDatabase | None = None
    if db is not None:  # try if we have a db connection in the parameters
        dbsystem = db
    else:
        raise ValueError("db parameter is required")

    attr_table = db._create_eattribute_view()
    entities_table = db.get_table("entity")

    if dates_between is not None:
        first_date, last_date = dates_between
        stmt = select(
            attr_table.c.the_value.label("value"),
            func.count(attr_table.c.entity.distinct()).label("count"),
            func.min(attr_table.c.the_date).label("date_min"),
            func.max(attr_table.c.the_date).label("date_max"),
        ).where(
            and_(
                attr_table.c.the_type == the_type,
                attr_table.c.the_date > first_date.strip("-"),
                attr_table.c.the_date < last_date.strip("-"),
            )
        )

    else:
        stmt = select(
            attr_table.c.the_value.label("value"),
            func.count(attr_table.c.entity.distinct()).label("count"),
            func.min(attr_table.c.the_date).label("date_min"),
            func.max(attr_table.c.the_date).label("date_max"),
        ).where(attr_table.c.the_type == the_type)

    if groupname is not None:
        if isinstance(groupname, list):
            stmt = stmt.where(
                select(entities_table.c.id)
                .where(
                    and_(
                        entities_table.c.id == attr_table.c.entity,
                        entities_table.c.groupname.in_(groupname),
                    )
                )
                .exists()
            )

        else:
            # where entity exists in entities with groupname = groupname
            stmt = stmt.where(
                select(entities_table.c.id)
                .where(
                    and_(
                        entities_table.c.id == attr_table.c.entity,
                        entities_table.c.groupname == groupname,
                    )
                )
                .exists()
            )

    stmt = stmt.group_by(attr_table.c.the_value).order_by(desc("count"))

    if sql_echo:
        print(stmt)

    mysession: Session
    if session is None:
        mysession = dbsystem.session()
    else:
        mysession = session
    with mysession:
        records = mysession.execute(stmt)

    df = pd.DataFrame.from_records(
        records, index=["value"], columns=["value", "count", "date_min", "date_max"]
    )

    return df