function SearchIndex::index

Same name in other branches
  1. 9 core/modules/search/src/SearchIndex.php \Drupal\search\SearchIndex::index()
  2. 8.9.x core/modules/search/src/SearchIndex.php \Drupal\search\SearchIndex::index()
  3. 11.x core/modules/search/src/SearchIndex.php \Drupal\search\SearchIndex::index()

File

core/modules/search/src/SearchIndex.php, line 89

Class

SearchIndex
Provides search index management functions.

Namespace

Drupal\search

Code

/**
 * Indexes the text of one item, replacing whatever the index held for it.
 *
 * The text is reduced to the tags listed in the 'index.tag_weights' setting
 * of 'search.settings', split into words by the text processor, and every
 * word that is numeric or at least 'index.minimum_word_size' characters long
 * receives a score. A word's score grows with the weights of the tags that
 * enclose it and shrinks (via the "focus" factor) as more unique words have
 * been seen. The item's previous entries are cleared before the new data is
 * written to the 'search_dataset' and 'search_index' tables.
 *
 * @param $type
 *   Value stored in the 'type' column of both tables and passed to clear().
 * @param $sid
 *   Value stored in the 'sid' column of both tables and passed to clear().
 * @param $langcode
 *   Language code handed to the text processor and stored in the 'langcode'
 *   column of both tables.
 * @param string $text
 *   The text to index; may contain HTML.
 * @param bool $update_weights
 *   (optional) Whether updateWordWeights() is called with the indexed words
 *   once writing finishes. Defaults to TRUE.
 *
 * @return array
 *   The words written to 'search_index' as keys, each mapped to TRUE.
 *
 * @throws SearchIndexException
 *   If writing to the database fails. updateWordWeights() still runs (when
 *   $update_weights is TRUE) for the words written before the failure.
 */
public function index($type, $sid, $langcode, $text, $update_weights = TRUE) {
    $settings = $this->configFactory
        ->get('search.settings');
    $minimum_word_size = $settings->get('index.minimum_word_size');
    // Keep track of the words that need to have their weights updated.
    $current_words = [];
    // Multipliers for scores of words inside certain HTML tags. The weights are
    // stored in config so that modules can overwrite the default weights.
    // Note: 'a' must be included for link ranking to work.
    $tags = $settings->get('index.tag_weights');
    // Strip off all ignored tags to speed up processing, but insert space
    // before and after them to keep word boundaries.
    $text = str_replace([
        '<',
        '>',
    ], [
        ' <',
        '> ',
    ], $text);
    $text = strip_tags($text, '<' . implode('><', array_keys($tags)) . '>');
    // Split HTML tags from plain text.
    $split = preg_split('/\\s*<([^>]+?)>\\s*/', $text, -1, PREG_SPLIT_DELIM_CAPTURE);
    // Note: PHP ensures the array consists of alternating delimiters and
    // literals and begins and ends with a literal (inserting empty strings as
    // required).
    // Odd/even counter. Tag or no tag.
    $tag = FALSE;
    // Starting score per word.
    $score = 1;
    // Accumulator for cleaned up data.
    $accumulator = ' ';
    // Stack with open tags.
    $tag_stack = [];
    // Counter for consecutive words.
    $tag_words = 0;
    // Focus state.
    $focus = 1;
    // Accumulator for words for index.
    $scored_words = [];
    foreach ($split as $value) {
        if ($tag) {
            // Increase or decrease score per word based on tag.
            // Isolate the tag name from its attributes. Attributes may follow the
            // name after any ASCII whitespace (space, tab, newline, ...), so
            // cutting at a literal space only would leave e.g. "a\nhref=..." as
            // the name and lose the tag's weight.
            $tagname = mb_strtolower(substr($value, 0, strcspn($value, " \t\n\r\f\v")));
            // Closing or opening tag? The name can be empty (for instance when
            // the text contains a literal "< b >"), so do not index into it.
            if (substr($tagname, 0, 1) === '/') {
                $tagname = substr($tagname, 1);
                // If we encounter unexpected tags, reset score to avoid incorrect
                // boosting.
                if (!count($tag_stack) || $tag_stack[0] != $tagname) {
                    $tag_stack = [];
                    $score = 1;
                }
                else {
                    // Remove from tag stack and decrement score. Names without a
                    // configured weight count as 0.
                    $score = max(1, $score - ($tags[array_shift($tag_stack)] ?? 0));
                }
            }
            else {
                if (isset($tag_stack[0]) && $tag_stack[0] == $tagname) {
                    // None of the tags we look for make sense when nested identically.
                    // If they are, it's probably broken HTML.
                    $tag_stack = [];
                    $score = 1;
                }
                else {
                    // Add to open tag stack and increment score. Names without a
                    // configured weight contribute nothing instead of triggering
                    // an undefined array key warning.
                    array_unshift($tag_stack, $tagname);
                    $score += $tags[$tagname] ?? 0;
                }
            }
            // A tag change occurred, reset counter.
            $tag_words = 0;
        }
        else {
            // Note: use of PREG_SPLIT_DELIM_CAPTURE above will introduce empty
            // values.
            if ($value != '') {
                $words = $this->textProcessor
                    ->process($value, $langcode);
                foreach ($words as $word) {
                    // Add word to accumulator.
                    $accumulator .= $word . ' ';
                    // Check word length.
                    if (is_numeric($word) || mb_strlen($word) >= $minimum_word_size) {
                        if (!isset($scored_words[$word])) {
                            $scored_words[$word] = 0;
                        }
                        $scored_words[$word] += $score * $focus;
                        // Focus is a decaying value in terms of the amount of unique
                        // words up to this point. From 100 words and more, it decays, to
                        // e.g. 0.5 at 500 words and 0.3 at 1000 words.
                        $focus = min(1, 0.01 + 3.5 / (2 + count($scored_words) * 0.015));
                    }
                    $tag_words++;
                    // Too many words inside a single tag probably mean a tag was
                    // accidentally left open.
                    if (count($tag_stack) && $tag_words >= 15) {
                        $tag_stack = [];
                        $score = 1;
                    }
                }
            }
        }
        $tag = !$tag;
    }
    // Remove the item $sid from the search index, and invalidate the relevant
    // cache tags.
    $this->clear($type, $sid, $langcode);
    try {
        // Insert cleaned up data into dataset.
        $this->connection
            ->insert('search_dataset')
            ->fields([
            'sid' => $sid,
            'langcode' => $langcode,
            'type' => $type,
            'data' => $accumulator,
            'reindex' => 0,
        ])
            ->execute();
        // Insert results into search index.
        foreach ($scored_words as $word => $word_score) {
            // If a word already exists in the database, its score gets increased
            // appropriately. If not, we create a new record with the appropriate
            // starting score.
            $this->connection
                ->merge('search_index')
                ->keys([
                'word' => $word,
                'sid' => $sid,
                'langcode' => $langcode,
                'type' => $type,
            ])
                ->fields([
                'score' => $word_score,
            ])
                ->expression('score', '[score] + :score', [
                ':score' => $word_score,
            ])
                ->execute();
            $current_words[$word] = TRUE;
        }
    } catch (\Exception $e) {
        throw new SearchIndexException("Failed to insert dataset in index for type '{$type}', sid '{$sid}' and langcode '{$langcode}'", 0, $e);
    } finally {
        // Runs on success and on failure alike, so the words written before an
        // error still get their weights refreshed.
        if ($update_weights) {
            $this->updateWordWeights($current_words);
        }
    }
    return $current_words;
}

Buggy or inaccurate documentation? Please file an issue. Need support? Need help programming? Connect with the Drupal community.