Source code for patternpiece.matcher

from typing import List, Dict
import multiprocessing


class PatternPiece(object):
    """
    Pattern piece matching class, used to call the AC automaton's Cython
    implementation. Multi-pattern, multi-element sequence matching.

    Examples
    --------
    >>> patterns = {(1, 2, 3): 0, (4, 5, 6): 1}  # key -> pattern, value -> index
    >>> automation = PatternPiece(patterns)
    """

    def __init__(self, patterns: Dict[tuple, int], mode: str = "memory"):
        """
        Initialize the automaton.

        Parameters
        ----------
        patterns : Dict[tuple, int]
            A dictionary to encode the input patterns (pattern -> index).
        mode : str, optional, default: 'memory'
            The mode for the AC automaton. It can be either 'memory' or
            'speed'. In memory mode, less memory is consumed when creating
            the trie, but the performance decreases during the search phase.
            Conversely, in speed mode, the performance is better, but it
            consumes more memory.

        Raises
        ------
        ValueError
            If the provided `mode` is not 'memory' or 'speed'. (`ValueError`
            is a subclass of `Exception`, so handlers written against the
            generic `Exception` keep working.)
        """
        # Validate the argument before touching the compiled extension so a
        # bad `mode` is reported immediately.
        if mode not in ("memory", "speed"):
            raise ValueError("Error `mode`, you can only choose [`memory`, `speed`]")

        # Imported inside the constructor rather than at module level, so the
        # extension module is only loaded when an instance is created.
        from .ac_matcher_nogil import ACMatcherSpeed, ACMatcherMemory

        matcher_cls = ACMatcherMemory if mode == "memory" else ACMatcherSpeed
        self._automaton = matcher_cls(patterns)

    def match(self, encoded: List[List[int]], num_workers: int = -1):
        """
        Search for the added patterns in the given sequences.

        Parameters
        ----------
        encoded : List[List[int]]
            The encoded sequences to search patterns in.
        num_workers : int, optional, default: -1
            The number of workers handed to the underlying matcher.
            Any negative value selects all CPUs reported by
            ``multiprocessing.cpu_count()``; ``0`` is treated as ``1``.
            A value greater than the CPU count is rejected.

        Returns
        -------
        list[list[tuple[int, int, int]]]
            A list of found patterns with their positions and indexes
            (index, start, end), as returned by the underlying matcher.

        Examples
        --------
        >>> patterns = {(1, 40, 500): 6}
        >>> automation = PatternPiece(patterns)
        >>> sequences = [[(1, 10, 100), (4, 40, 400), (5, 50, 500)]]
        >>> results = automation.match(sequences)
        >>> # results = [[(6, 0, 3)]]

        NOTE(review): the example passes tuples as sequence elements while
        the annotation says ``List[List[int]]`` -- confirm the accepted
        element type against the Cython matcher.

        Raises
        ------
        TypeError
            If `num_workers` is not an integer.
        ValueError
            If `num_workers` exceeds the number of available CPUs.
            Presumably also raised by the underlying matcher when the
            automaton has not been built -- not visible in this module,
            confirm against the extension.
        """
        if not isinstance(num_workers, int):
            raise TypeError("The `num_workers` need to be `int` type.")

        # Normalise the requested worker count against the machine's CPUs.
        num_cpus = multiprocessing.cpu_count()
        if num_workers < 0:
            num_workers = num_cpus
        elif num_workers == 0:
            num_workers = 1
        elif num_workers > num_cpus:
            raise ValueError("The `num_workers` has exceeded the maximum physical limits, please check.")

        # Parallel matching is delegated entirely to the automaton.
        return self._automaton.match(encoded, num_workers)