python 收集字符串的所有唯一子串的有效方法

我需要确定一个字符串中的所有子字符串具有最小的大小和重复。需要注意的是，我不希望返回的子字符串本身是其他返回的子字符串的子字符串。换句话说，子串的集合需要是不相交的集合。下面的函数可以工作，但效率很低。97.9%的时间花在重新计算后缀数组和LCP上。我之所以这样做，是因为在我从字符串中删除最后一个子串并重新计算SA和LCP之后，我可以保证最后一个子串的任何子串都不会被添加。是否有更有效的方法来实现这一点，需要计算一次SA和LCP？

from typing import Dict, List, NamedTuple
import numpy as np
import pandas as pd
from pydivsufsort import divsufsort, kasai

class Substring(NamedTuple):
    length: int
    count: int

def find_unique_repeat_substrings(s: bytes, min_length: int = 20, min_repeats: int = 10) -> Dict[str, Substring]:

    string_dict = dict()
    K = len(s)

    while K>=min_length:
        sa = divsufsort(s)
        lcp = kasai(s, sa)
        K_loc = np.argmax(lcp)
        K=np.max(lcp)
        #calculate number of repeats
        loc = K_loc+1
        while lcp[loc]==K:
            loc += 1
        cnt = loc-K_loc+1
        longest_string = s[sa[K_loc]:sa[K_loc]+K]
        
        #add substring to dict
        if cnt >= min_repeats and K>=min_length:
            string_dict[longest_string.decode()] = Substring(length=K, count=cnt)
        
        #remove substring
        s = s.replace(longest_string, b"")  # Replacing with bytes
    return(string_dict)

s = "this string is repeated three times in this sentence. string string.".encode()
string_dict = find_unique_repeat_substrings(s,min_length = 4, min_repeats=2)
string_dict

{' string '：Substring（length=8，count=2），'this'：String（length=4，count=2）}

在回答了@stef之后，我想我有了一个更好的方向。我很确定这可以改进，但它比原来快得多，可能是O（n log n）。

import numpy as np
from pydivsufsort import divsufsort, kasai
from intervaltree import Interval, IntervalTree

def update_ranges(ranges_tree: IntervalTree, new_range: tuple):
    """
    This function updates the ranges in the given interval tree.
    It removes any intervals from the tree that are fully contained within the new interval.
    If the new interval is not fully covered by any existing intervals in the tree, it adds the new interval.

    :param ranges_tree: The interval tree to update.
    :param new_range: The new interval.
    :return: The updated interval tree.
    """
    new_interval = Interval(new_range[0], new_range[1])
    
    overlapping_intervals = ranges_tree[new_interval.begin:new_interval.end]  # Find overlapping intervals.
    
    # Remove intervals from the IntervalTree that are fully contained within new_interval.
    for interval in overlapping_intervals:
        if interval.begin >= new_interval.begin and interval.end <= new_interval.end:
            ranges_tree.remove(interval)

    # If new_interval is not fully covered by any existing intervals, add it to the IntervalTree.
    if not any(interval.begin <= new_interval.begin and interval.end >= new_interval.end for interval in overlapping_intervals):
        ranges_tree.add(new_interval)

    return ranges_tree  # Return updated IntervalTree.

def find_unique_repeat_substrings(s: bytes, min_length: int = 4, min_repeats: int = 2):
    """
    This function finds unique repeated substrings of a specified minimum length and number of repetitions in the given string.

    :param s: The input string.
    :param min_length: The minimum length of substrings to consider.
    :param min_repeats: The minimum number of repetitions of substrings.
    :return: A list of unique repeated substrings that meet the conditions.
    """
    if len(s) <= 1:
        return []
        
    sa = divsufsort(s)
    lcp = kasai(s, sa)

    zip_array = np.column_stack((sa, np.roll(lcp, 1, axis=0)))  # Roll lcp 1 position and stack with SA.

    active_string = False
    cnt = 1
    string_ranges = IntervalTree()
    max_current_str_len = 0

    for row in zip_array:
        # Evaluating active string; if lcp value is descending, exit active substring.
        if active_string and row[1] < max_current_str_len:
            if cnt >= min_repeats:  # If substring repeated enough times, update string ranges.
                string_ranges = update_ranges(string_ranges, (start_str, start_str + str_len ))
            active_string = False
            cnt = 1
            max_current_str_len = 0
        # Evaluating active substring; if lcp value is non-descending, increment repeat count.
        elif active_string and row[1] >= max_current_str_len:
            cnt += 1
            max_current_str_len = max(max_current_str_len,row[1])
        # New substring found of at least size min_length; start tracking it.
        elif row[1] >= min_length and not active_string:
            start_str = row[0]
            str_len = row[1]
            active_string = True
            cnt += 1
            max_current_str_len = row[1]
    
    # If a new substring was being tracked at the end of the SA/LCP, update string ranges.
    if cnt >= min_repeats:
        string_ranges = update_ranges(string_ranges, (start_str, start_str + str_len ))

    # Convert ranges to strings
    # Sorted is used to maintain a consistent order of the strings based on their starting position
    strings = sorted([s[x.begin:x.end] for x in string_ranges])
    
    return strings  # Return a list of unique repeated substrings that meet the conditions.

import unittest

class TestFindUniqueRepeatSubstrings(unittest.TestCase):
    def test_find_unique_repeat_substrings(self):
        test_cases = [
            {
                's': "this string is repeated three times in this sentence. string string .".encode(),
                'min_length': 4,
                'min_repeats': 2,
                'expected_result': [b' string ', b'this s'],
            },
            {
                's': "s st str stri strin string".encode(),
                'min_length': 4,
                'min_repeats': 2,
                'expected_result': [b' str', b'stri', b'trin'],
            },
            {
                's': "banana".encode(),
                'min_length': 2,
                'min_repeats': 2,
                'expected_result': [b'ana'],
            },
            {
                's': "".encode(),
                'min_length': 3,
                'min_repeats': 2,
                'expected_result': [],
            },
            {
                's': "a".encode(),
                'min_length': 1,
                'min_repeats': 1,
                'expected_result': [],
            },
                        {
                's': "aa".encode(),
                'min_length': 1,
                'min_repeats': 2,
                'expected_result': [b'a'],
            },
        ]

        for case in test_cases:
            s = case['s']
            min_length = case['min_length']
            min_repeats = case['min_repeats']
            expected_result = case['expected_result']

            try:
                result = find_unique_repeat_substrings(s, min_length, min_repeats)
                self.assertEqual(result, expected_result)
            except AssertionError as e:
                error_msg = f"AssertionError in test case:\n" \
                            f"s: {s}\n" \
                            f"min_length: {min_length}\n" \
                            f"min_repeats: {min_repeats}\n" \
                            f"Expected: {expected_result}\n" \
                            f"Result: {result}\n"
                raise AssertionError(error_msg) from e

suite = unittest.TestLoader().loadTestsFromTestCase(TestFindUniqueRepeatSubstrings)
runner = unittest.TextTestRunner()
runner.run(suite)

.
----------------------------------------------------------------------
Ran 1 test in 0.007s

OK

python 收集字符串的所有唯一子串的有效方法

1条答案

相关问题

热门标签

最新问答