Fastest String Filtering Algorithm

674 Views Asked by At

I have 5,000,000 unordered strings formatted this way (Name.Name.Day-Month-Year 24hrTime):

"John.Howard.12-11-2020 13:14"
"Diane.Barry.29-07-2020 20:50"
"Joseph.Ferns.08-05-2020 08:02"
"Joseph.Ferns.02-03-2020 05:09"
"Josephine.Fernie.01-01-2020 07:20"
"Alex.Alexander.06-06-2020 10:10"
"Howard.Jennings.07-07-2020 13:17"
"Hannah.Johnson.08-08-2020 00:49"
...

What is the fastest way to find all strings having a time t between some n and m? (i.e. fastest way to remove all strings whose time < n || m < time)

This filtering will be done multiple times with different ranges. Time ranges must always be on the same day and the starting time is always earlier than the end time.

In java, heres's my current approach given some time string M and N and a 5 million string list:

ArrayList<String> finalSolution = new ArrayList<>();

String[] startingMtimeArr = m.split(":");
String[] startingNtimeArr = n.split(":");
Integer startingMhour = Integer.parseInt(startingMtimeArr[0]);
Integer startingMminute = Integer.parseInt(startingMtimeArr[1]);
Integer endingNhour = Integer.parseInt(startingNtimeArr[0]);
Integer endingNminute = Integer.parseInt(startingNtimeArr[1]);

for combinedString in ArraySizeOf5Million{
  String[] arr = combinedString.split(".");
  String[] subArr = arr[2].split(" ");
  String[] timeArr = subArr[1].split(":");
  String hour = timeArr[0];
  String minute = timeArr[1];

   If hour >= startingMhour 
        && minute >= startingMminute 
        && hour <= endingNhour 
        && minute <= endingNminute {
    finalSolution.add(hour)
   } 
}

Java's my native language but any other languages work too. Better/faster logic is what I am after

3

There are 3 best solutions below

4
On

Some example in Python using index for every minute:

from pprint import pprint
from itertools import groupby

big_list = [
    "John.Howard.12:14",
    "Diane.Barry.13:50",
    "xxxDiane.Barryxxx.13:50",  # <-- added a name in the same HH:MM
    "Joseph.Ferns.08:02",
    "Joseph.Ferns.05:09",
    "Josephine.Fernie.07:20",
    "Alex.Alexander.10:10",
    "Howard.Jennings.12:17",
    "Hannah.Johnson.00:49",
]

# 1. sort the list by time HH:MM
big_list = sorted(big_list, key=lambda k: k[-5:])

# the list is now:

# ['Hannah.Johnson.00:49',
#  'Joseph.Ferns.05:09',
#  'Josephine.Fernie.07:20',
#  'Joseph.Ferns.08:02',
#  'Alex.Alexander.10:10',
#  'John.Howard.12:14',
#  'Howard.Jennings.12:17',
#  'Diane.Barry.13:50',
#  'xxxDiane.Barryxxx.13:50']

# 2. create an index (for every minute in a day)
index = {}

times = []
for i, item in enumerate(big_list):
    times.append(int(item[-5:-3]) * 60 + int(item[-2:]))

last = 0
cnt = 0
for v, g in groupby(times):
    for i in range(last, v):
        index[i] = [cnt, cnt]
    s = sum(1 for _ in g)
    index[v] = [cnt, cnt + s]
    cnt += s
    last = v + 1

for i in range(last, 60 * 24):
    index[i] = [cnt, cnt]


# 3. you can now do a fast query using the index
def find_all_strings(n, m):
    n = int(n[-5:-3]) * 60 + int(n[-2:])
    m = int(m[-5:-3]) * 60 + int(m[-2:])

    return big_list[index[n][0] : index[m][1]]


print(find_all_strings("00:10", "00:30"))  # []
print(find_all_strings("00:30", "00:50"))  # ['Hannah.Johnson.00:49']
print(find_all_strings("12:00", "13:55"))  # ['John.Howard.12:14', 'Howard.Jennings.12:17', 'Diane.Barry.13:50', 'xxxDiane.Barryxxx.13:50']
print(find_all_strings("13:00", "13:55"))  # ['Diane.Barry.13:50', 'xxxDiane.Barryxxx.13:50']
print(find_all_strings("15:00", "23:00"))  # []
0
On

Since the data will be searched many times I first parse the strings to make it easy fo search multiple times = see by_date.

I use binary search to find the first string of a particular day then iterate through increasing times collecting appropriate strings in variable filtered of function strings_between.

# -*- coding: utf-8 -*-
"""
https://stackoverflow.com/questions/67562250/fastest-string-filtering-algorithm

Created on Tue May 18 09:20:11 2021

@author: Paddy3118
"""

strings = """\
John.Howard.12-11-2020 13:14
Diane.Barry.29-07-2020 20:50
Joseph.Ferns.08-05-2020 08:02
Joseph.Ferns.02-03-2020 05:09
Josephine.Fernie.01-01-2020 07:20
Alex.Alexander.06-06-2020 10:10
Howard.Jennings.07-07-2020 13:17
Hannah.Johnson.08-08-2020 00:49
Josephine.Fernie.08-08-2020 07:20
Alex.Alexander.08-08-2020 10:10
Howard.Jennings.08-08-2020 13:17
Hannah.Johnson.08-08-2020 09:49\
"""

## First parse the date information once for all future range calcs

def to_mins(hr_mn='00:00'):
    hr, mn = hr_mn.split(':')
    return int(hr) * 60 + int(mn)


by_date = dict()    # Keys are individual days, values are time-sorted
for s in strings.split('\n'):
    name_day, time = s.strip().split()
    name, day = name_day.rsplit('.', 1)
    minutes = to_mins(time)
    if day not in by_date:
        by_date[day] = [(minutes, s)]
    else:
        by_date[day].append((minutes, s))
for day_info in by_date.values():
    day_info.sort()


## Now rely on dict search for day then binary +linear search within day.

def _bisect_left(a, x):
    """Return the index where to insert item x in list a, assuming a is sorted.
    The return value i is such that all e in a[:i] have e < x, and all e in
    a[i:] have e >= x.  So if x already appears in the list, a.insert(x) will
    insert just before the leftmost x already there.

    'a' is a list of tuples whose first item is assumed sorted and searched apon.
    """

    lo, hi = 0, len(a)
    while lo < hi:
        mid = (lo+hi)//2
        # Use __lt__ to match the logic in list.sort() and in heapq
        if a[mid][0] < x: lo = mid+1
        else: hi = mid
    return lo


def strings_between(day="01-01-2020", start="00:00", finish="23:59"):
    global by_date

    if day not in by_date:
        return []
    day_data = by_date[day]
    start, finish = to_mins(start), to_mins(finish)
    from_index = _bisect_left(day_data, start)

    filtered = []
    for time, s in day_data[from_index:]:
        if time <= finish:
            filtered.append(s)
        else:
            break
    return filtered


## Example data

assert by_date == {
 '12-11-2020': [(794, 'John.Howard.12-11-2020 13:14')],
 '29-07-2020': [(1250, 'Diane.Barry.29-07-2020 20:50')],
 '08-05-2020': [(482, 'Joseph.Ferns.08-05-2020 08:02')],
 '02-03-2020': [(309, 'Joseph.Ferns.02-03-2020 05:09')],
 '01-01-2020': [(440, 'Josephine.Fernie.01-01-2020 07:20')],
 '06-06-2020': [(610, 'Alex.Alexander.06-06-2020 10:10')],
 '07-07-2020': [(797, 'Howard.Jennings.07-07-2020 13:17')],
 '08-08-2020': [(49, 'Hannah.Johnson.08-08-2020 00:49'),
                (440, 'Josephine.Fernie.08-08-2020 07:20'),
                (589, 'Hannah.Johnson.08-08-2020 09:49'),
                (610, 'Alex.Alexander.08-08-2020 10:10'),
                (797, 'Howard.Jennings.08-08-2020 13:17')]}

## Example queries from command line
"""
In [7]: strings_between('08-08-2020')
Out[7]:
['Hannah.Johnson.08-08-2020 00:49',
 'Josephine.Fernie.08-08-2020 07:20',
 'Hannah.Johnson.08-08-2020 09:49',
 'Alex.Alexander.08-08-2020 10:10',
 'Howard.Jennings.08-08-2020 13:17']

In [8]: strings_between('08-08-2020', '09:30', '24:00')
Out[8]:
['Hannah.Johnson.08-08-2020 09:49',
 'Alex.Alexander.08-08-2020 10:10',
 'Howard.Jennings.08-08-2020 13:17']

In [9]: strings_between('08-08-2020', '09:49', '10:10')
Out[9]: ['Hannah.Johnson.08-08-2020 09:49', 'Alex.Alexander.08-08-2020 10:10']

In [10]:
"""
0
On

As @Paddy3118 already pointed out, binary search is probably the way to go.

  1. (if your data is on disk): Load input data and sort by date/time.
  2. With i0 being the start index of the result set and i1 being the end index of the result set (both obtained from binary search): enumerate resulting entries.

The code I used (in Lisp) is shown at the end of this answer. It is not optimized in the slightest (I guess it would be possible to make the loading and initial sorting much faster with a some optimization effort).

This is how my interactive session looked like (includes timing information, for my foo.txt input file containing 5 million entries).

rlwrap sbcl --dynamic-space-size 2048
This is SBCL 2.1.1.debian, an implementation of ANSI Common Lisp. More information about SBCL is available at http://www.sbcl.org/. SBCL is free software, provided as is, with absolutely no warranty. It is mostly in the public domain; some portions are provided under BSD-style licenses. See the CREDITS and COPYING files in the distribution for more information.
(ql:quickload :cl-ppcre)
To load "cl-ppcre":
Load 1 ASDF system:
cl-ppcre
; Loading "cl-ppcre"
..
(:CL-PPCRE)
(load "fivemillion.lisp")
T
(time (defparameter data (load-input-for-queries "foo.txt")))
"sorting..."
Evaluation took:
32.091 seconds of real time
32.090620 seconds of total run time (31.386722 user, 0.703898 system)
[ Run times consist of 2.641 seconds GC time, and 29.450 seconds non-GC time. ]
100.00% CPU
15 lambdas converted
115,308,171,684 processor cycles
6,088,198,752 bytes consed
DATA
(time (defparameter output (query-interval data '(2018 1 1) '(2018 1 2))))
Evaluation took:
0.000 seconds of real time
0.000111 seconds of total run time (0.000109 user, 0.000002 system)
100.00% CPU
395,172 processor cycles
65,536 bytes consed
OUTPUT
(time (defparameter output (query-interval data '(2018 1 1) '(2018 1 2 8))))
Evaluation took:
0.000 seconds of real time
0.000113 seconds of total run time (0.000110 user, 0.000003 system)
100.00% CPU
399,420 processor cycles
65,536 bytes consed
OUTPUT
(time (defparameter output (query-interval data '(2018 1 1) '(2019 1 1))))
Evaluation took:
0.020 seconds of real time
0.022469 seconds of total run time (0.022469 user, 0.000000 system)
110.00% CPU
80,800,092 processor cycles
15,958,016 bytes consed
OUTPUT

So, while the load and sort time (done once) is nothing to write home about (but could be optimized), the (query-interval ...) calls are pretty fast. The bigger the result set of the query, the longer the list the function returns (more conses, more run time). I could have been more clever and just return the start and end indices of the result set and leave the collecting of the entries to the caller.

Here the source code, which also includes code for generating the test data sets I used:

(defun random-uppercase-character ()
  (code-char (+ (char-code #\A) (random 26))))
(defun random-lowercase-character ()
  (code-char (+ (char-code #\a) (random 26))))
(defun random-name-part (nchars)
  (with-output-to-string (stream)
    (write-char (random-uppercase-character) stream)
    (loop repeat (- nchars 1) do
      (write-char (random-lowercase-character) stream))))
(defun random-day-of-month ()
  "Assumes every month has 31 days, because it does not matter
for this exercise."
  (+ 1 (random 31)))
(defun random-month-of-year ()
  (+ 1 (random 12)))
(defun random-year ()
  "Some year between 2017 and 2022"
  (+ 2017 (random 5)))
(defun random-hour-of-day ()
  (random 24))
(defun random-minute-of-hour ()
  (random 60))
(defun random-entry (stream)
  (format stream "\"~a.~a.~d-~d-~d ~d:~d\"~%"
      (random-name-part 10)
      (random-name-part 10)
      (random-day-of-month)
      (random-month-of-year)
      (random-year)
      (random-hour-of-day)
      (random-minute-of-hour)))
(defun generate-input (entry-count file-name)
  (with-open-file (stream
           file-name
           :direction :output
           :if-exists :supersede)
    (loop repeat entry-count do
      (random-entry stream))))

(defparameter *line-scanner*
  (ppcre:create-scanner
   "\"(\\w+).(\\w+).(\\d+)-(\\d+)-(\\d+)\\s(\\d+):(\\d+)\""))
;;      0       1      2      3      4        5      6
;;      fname   lname  day    month  year     hour   minute

(defun decompose-line (line)
  (let ((parts (nth-value
        1
        (ppcre:scan-to-strings
         *line-scanner*
         line))))
    (make-array 7 :initial-contents
        (list (aref parts 0)
              (aref parts 1)
              (parse-integer (aref parts 2))
              (parse-integer (aref parts 3))
              (parse-integer (aref parts 4))
              (parse-integer (aref parts 5))
              (parse-integer (aref parts 6))))))
(defconstant +fname-index+ 0)
(defconstant +lname-index+ 1)
(defconstant +day-index+ 2)
(defconstant +month-index+ 3)
(defconstant +year-index+ 4)
(defconstant +hour-index+ 5)
(defconstant +minute-index+ 6)
(defvar *compare-<-criteria*
  (make-array 5 :initial-contents
          (list +year-index+
            +month-index+
            +day-index+
            +hour-index+
            +minute-index+)))

(defun compare-< (dl1 dl2)
  (labels ((comp (i)
         (if (= i 5)
         nil
         (let ((index (aref *compare-<-criteria* i)))
           (let ((v1 (aref dl1 index))
             (v2 (aref dl2 index)))
             (cond
               ((< v1 v2) t)
               ((= v1 v2) (comp (+ i 1)))
               (t nil)))))))
    (comp 0)))
           
(defun time-stamp-to-index (hours minutes)
  (+ minutes (* 60 hours)))

(defun load-input-for-queries (file-name)
  (let* ((decomposed-line-list
       (with-open-file (stream file-name :direction :input)
         (loop for line = (read-line stream nil nil)
           while line
           collect (decompose-line line))))
     (number-of-lines (length decomposed-line-list))
     (decomposed-line-array (make-array number-of-lines
                        :initial-contents
                        decomposed-line-list)))
    (print "sorting...") (terpri)
    (sort decomposed-line-array #'compare-<)))

(defun unify-date-list (date)
  (let ((date-length (length date)))
    (loop
      for i below 5
      collecting (if (> date-length i) (nth i date) 0))))

(defun decomposed-line-date<date-list (decomposed-line date-list)
  (labels ((comp (i)
         (if (= i 5)
         nil
         (let ((index (aref *compare-<-criteria* i)))
           (let ((v1 (aref decomposed-line index))
             (v2 (nth i date-list)))
             (cond
               ((< v1 v2) t)
               ((= v1 v2) (comp (+ i 1)))
               (t nil)))))))
    (comp 0)))

(defun index-before (data key predicate
             &key (left 0) (right (length data)))
  (if (and (< left right) (> (- right left) 1))
      (if (funcall predicate (aref data left) key)
      (let ((mid (+ left (floor (- right left) 2))))
        (if (funcall predicate (aref data mid) key)
        (index-before data key predicate
                  :left mid
                  :right right)
        (index-before data key predicate
                  :left left
                  :right mid)))
      left)
      right))

(defun query-interval (data start-date end-date)
  "start-date and end-date are given as lists of the form:
'(year month day hour minute) or shorter versions e.g.
'(year month day hour), omitting trailing values which will be
appropriately defaulted."
  (let ((d0 (unify-date-list start-date))
    (d1 (unify-date-list end-date)))
    (let* ((start-index (index-before
             data
             d0
             #'decomposed-line-date<date-list))
       (end-index (index-before
               data
               d1
               #'decomposed-line-date<date-list
               :left (cond
                   ((< start-index 0) 0)
                   ((>= start-index (length data))
                (length data))
                   (t start-index)))))
      (loop for i from start-index below end-index
        collecting (aref data i)))))