Source code for Katna.image_filters.text_detector

"""
.. module:: Katna.image_filters.text_detector
    :platform: OS X
    :synopsis: This module is implementation of text detector filter
"""

import os
import cv2
import numpy as np
import time
import requests
import random
from imutils.object_detection import non_max_suppression
from Katna.image_filters.filter import Filter
import Katna.config as config


[docs]class TextDetector(Filter):
    """TextDetector Class: Class for implementation of text detector filter, inherit from Filter class
    """

    def __init__(self, weight=1.0):
        """Constructor for this class does following tasks, if not already downloaded\
        , it first downloads text detector dnn weights file from public URL\
        ands save it at USER_HOME/.katna directory, or /tmp/.katna directory.\
        After this initializer code initializes internal parameter: \
        min_confidence (for text detection)
        """
        super().__init__(weight)
        self.min_confidence = config.TextDetector.min_confidence
        self.merge_threshold = config.TextDetector.merge_threshold
        self.layerNames = config.TextDetector.layerNames
        self.frozen_weights = config.TextDetector.frozen_weights
        self.cache_subdir = config.TextDetector.cache_subdir

        try:
            self.network_folder_path = os.path.join(os.path.expanduser("~"), ".katna")
            if not os.access(self.network_folder_path, os.W_OK):
                self.network_folder_path = os.path.join("/tmp", ".katna")
            self.datadir = os.path.join(self.network_folder_path, self.cache_subdir)
            if not os.path.exists(self.datadir):
                os.makedirs(self.datadir)

            self.network_file_path = os.path.join(self.datadir, self.frozen_weights)
            if not os.path.exists(self.network_file_path):
                self.download_data()

            self.net = cv2.dnn.readNet(self.network_file_path)

        except Exception:
            raise FileNotFoundError(
                self.frozen_weights
                + " seems to be missing.\
                 Download the file and specify the full path\
                      while initializing TextDetector class"
            )

[docs]    def download_data(self):
        """Public function for downloading the network weight from the URL link, to be used for
        text detection functionality. 
        Troubleshooting tip: If you get FileNotFound error during text detector initialization,
        initialize the text detector and call this function directly to download the model file from public URL link.
        """
        # create response object
        link = config.TextDetector.model_download_link
        r = requests.get(link, stream=True)
        # download started
        print("Downloading model file...")
        # if not os.path.isfile(self.network_file_path) or not os.path.exists(self.network_file_path):
        with open(os.path.join(self.datadir, self.frozen_weights), "wb") as f:
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                if chunk:
                    f.write(chunk)
        print("Model file downloaded.")

    def __decode_predictions(self, scores, geometry):
        """Internal Function for getting bounding box and confidence values 
        from text detector dnn network output (scores, geometry)
        function takes the number of rows and columns from the scores volume, then
        initializes set of bounding box rectangles and corresponding confidence scores
        """
        (numRows, numCols) = scores.shape[2:4]
        rects = []
        confidences = []

        # loop over the number of rows
        for y in range(0, numRows):
            # extract the scores (probabilities), followed by the
            # geometrical data used to derive potential bounding box
            # coordinates that surround text
            scoresData = scores[0, 0, y]
            xData0 = geometry[0, 0, y]
            xData1 = geometry[0, 1, y]
            xData2 = geometry[0, 2, y]
            xData3 = geometry[0, 3, y]
            anglesData = geometry[0, 4, y]

            # loop over the number of columns
            for x in range(0, numCols):
                # if our score does not have sufficient probability,
                # ignore it
                if scoresData[x] < self.min_confidence:
                    continue

                # compute the offset factor as our resulting feature
                # maps will be 4x smaller than the input image
                (offsetX, offsetY) = (x * 4.0, y * 4.0)

                # extract the rotation angle for the prediction and
                # then compute the sin and cosine
                angle = anglesData[x]
                cos = np.cos(angle)
                sin = np.sin(angle)

                # use the geometry volume to derive the width and height
                # of the bounding box
                h = xData0[x] + xData2[x]
                w = xData1[x] + xData3[x]

                # compute both the starting and ending (x, y)-coordinates
                # for the text prediction bounding box
                endX = int(offsetX + (cos * xData1[x]) + (sin * xData2[x]))
                endY = int(offsetY - (sin * xData1[x]) + (cos * xData2[x]))
                startX = int(endX - w)
                startY = int(endY - h)

                # add the bounding box coordinates and probability score
                # to our respective lists
                rects.append((startX, startY, endX, endY))
                confidences.append(scoresData[x])

        # return a tuple of the bounding boxes and associated confidences
        return (rects, confidences)

    def __merge_boxes(self, rects):
        """main function to detect text boxes from image

        :param rects: list of 
        :type rects: numpy array
        :param rectsUsed: image file in numpy array/opencv format
        :type rectsUsed: numpy array

        :return: output image with the list of text boxes
        :rtype: file, list
        """

        def grouper(iterable, interval=2):
            prev = None
            group = []
            for item in iterable:
                if not prev or abs(item[1] - prev[1]) <= interval:
                    group.append(item)
                else:
                    yield group
                    group = [item]
                prev = item
            if group:
                yield group

        rects_used = []
        heights = list()
        for bbox in rects:
            heights.append(bbox[3] - bbox[1])
        heights = sorted(heights)  # Sort heights
        median_height = heights[len(heights) // 2] / 2  # Find half of the median height

        bboxes_list = sorted(
            rects, key=lambda k: k[1]
        )  # Sort the bounding boxes based on y1 coordinate ( y of the left-top coordinate )
        combined_bboxes = grouper(
            bboxes_list, median_height
        )  # Group the bounding boxes
        for group in combined_bboxes:
            x_min = min(group, key=lambda k: k[0])[0]  # Find min of x1
            x_max = max(group, key=lambda k: k[2])[2]  # Find max of x2
            y_min = min(group, key=lambda k: k[1])[1]  # Find min of y1
            y_max = max(group, key=lambda k: k[3])[3]  # Find max of y2
            rects_used.append([x_min, y_min, x_max, y_max])
        return rects_used

    def __detect_text(self):
        """Internal function to detect text bounding boxes from input image.
        Returns list of bounding boxes of each detected text field in input image.

        :param image: image file in numpy array/opencv format
        :type image: numpy array
        :param output_image: image file in numpy array/opencv format
        :type output_image: numpy array

        :return: output image with the list of text boxes
        :rtype: file, list
        """
        (H, W) = self.image.shape[:2]
        rW = W / 320
        rH = H / 320
        image = cv2.resize(self.image, (320, 320))
        (H, W) = image.shape[:2]

        # construct a blob from the image and then perform a forward pass of
        # the model to obtain the two output layer sets
        blob = cv2.dnn.blobFromImage(
            self.image, 1.0, (W, H), (123.68, 116.78, 103.94), swapRB=True, crop=False
        )

        self.net.setInput(blob)
        (scores, geometry) = self.net.forward(self.layerNames)

        rects, confidences = self.__decode_predictions(scores, geometry)
        # apply non-maxima suppression to suppress weak, overlapping bounding
        # boxes
        boxes = non_max_suppression(np.array(rects), probs=confidences)
        text_rects = []
        # loop over the bounding boxes
        for (startX, startY, endX, endY) in boxes:
            # scale the bounding box coordinates based on the respective
            # ratios

            startX = int(startX * rW)
            startY = int(startY * rH)
            endX = int(endX * rW)
            endY = int(endY * rH)
            cv2.rectangle(self.image, (startX, startY), (endX, endY), (0, 0, 255), 3)
            text_rects.append([startX, startY, endX, endY])

        text_rects = sorted(text_rects, key=lambda item: item[0])
        final_rects = text_rects
        if len(text_rects) > 0:
            final_rects = self.__merge_boxes(text_rects)

        return final_rects

[docs]    def set_image(self, image):
        """Public set_image function, This will detect all text boxes in input image and
        will saves them as internal list of text_rect to be used in get_filter_result

        :param image: input image from which needs to be cropped
        :type image: numpy array(opencv)
        """
        if image is None:
            return None
        self.image = image
        self.text_rects = self.__detect_text()

[docs]    def get_filter_result(self, crop):
        """Main public function of TextDetector filter class,
        this filter Returns false if crop contains no text, additionally
        checks for overlap between input crop rectangle and the detected
        text bounding box, returns True if No overlap (Filter will not discard input crop)
        otherwise returns False (signal for discarding input crop).
        
        :param crop: input crop rectangle to test
        :type crop: crop_rect
        :return: True if No overlap (Filter will not discard input crop) otherwise returns False 
        :rtype: bool
        """
        # rect: xs,ys,xe,ye
        # crop: x,y,w,h
        if self.text_rects is None or len(self.text_rects) == 0:
            return True

        for rect in self.text_rects:
            if not (
                (rect[2]) <= (crop.x + crop.w)
                and (rect[0]) >= (crop.x)
                and (rect[1]) >= (crop.y)
                and (rect[3]) <= (crop.y + crop.h)
            ):
                return False
            else:
                return True

        return True