diff --git a/catfish/catfish/CatfishSearchEngine.py b/catfish/catfish/CatfishSearchEngine.py index 4ad19cc..7ac3426 100644 --- a/catfish/catfish/CatfishSearchEngine.py +++ b/catfish/catfish/CatfishSearchEngine.py @@ -20,6 +20,7 @@ # pylint: disable=C0114 # pylint: disable=C0116 +from odf.opendocument import load as odfload import logging import io @@ -487,6 +488,12 @@ class CatfishSearchMethod_Fulltext(CatfishSearchMethod): used as a replacement for the 'find' search method, which is difficult to interrupt and is slower than os.walk.""" + openers = { + '<>': lambda fname: open(fname, "r"), + 'vnd.oasis.opendocument': lambda fname: io.StringIO(str(odfload(fname).body)), + 'text': lambda fname: open(fname, "r"), + } + def __init__(self): """Initialize the 'fulltext' search method.""" super().__init__("fulltext") @@ -581,15 +588,10 @@ class CatfishSearchMethod_Fulltext(CatfishSearchMethod): True if still running.""" self.running = True - find_keywords_backup = [] - if not self.exact: - # Split the keywords into a list if they are not already. - if isinstance(keywords, str): - keywords = keywords.replace(',', ' ').strip().split() - - for keyword in keywords: - if keyword not in find_keywords_backup: - find_keywords_backup.append(keyword) + if isinstance(keywords, str): + keywords = set(keywords.replace(',', ' ').split()) if self.exact else {keywords} + else: + keywords = {" ".join(keywords)} if self.exact else set(keywords) # Start walking the folder structure. for root, dirs, files in os.walk(path): # pylint: disable=W0612 @@ -601,48 +603,33 @@ class CatfishSearchMethod_Fulltext(CatfishSearchMethod): continue for filename in files: + if self.force_stop: break + mime = guess_type(filename)[0] or "<>" + fname = os.path.join(root, filename) + if not os.path.isfile(fname): continue try: - fullpath = os.path.join(root, filename) - - # Skip if special file. - if not os.path.isfile(fullpath): - continue - if os.path.getsize(fullpath) == 0: - continue - if fullpath.lower().endswith('.pdf'): - if self.search_pdf(fullpath, keywords): - yield fullpath - if zipfile.is_zipfile(fullpath): - yield fullpath - # Skip if not text file. - if not self.is_txt(filename): - continue - # Check character encoding, skip if binary. - charset = self.check_charset(root, filename) - if charset == 'binary': - continue - - # Check each line. If a keyword is found, yield. - open_file = open(fullpath, 'r', encoding=charset) - with open_file as file_text: - if self.search_text(file_text, keywords): - yield fullpath - # Skips on errors, move on to next in list. - except UnicodeDecodeError: - continue - except UnicodeError: - continue - except FileNotFoundError: - continue - except PermissionError: - continue - except OSError: - continue + for submime in self.openers: + if submime in mime: + opened = self.openers[submime](fname) + if self.textsearch(opened, keywords, regex): + yield fname + except (IOError, UnicodeDecodeError, ValueError): + pass yield True yield False self.force_stop = False self.running = False + def textsearch(self, stream, keywords, regex): + """Internal text search for keywords in string-oriented stream.""" + tofind, res = re.compile("|".join(keywords)), keywords.copy() + for line in stream: + if self.force_stop: break + res -= set(tofind.findall(line)) + if not res: + return True + return False + def stop(self): """Stop the running search method.""" self.force_stop = True