function search_index

Same name in other branches
  1. 8.9.x core/modules/search/search.module \search_index()

Update the full-text search index for a particular item.

Parameters

$sid: An ID number identifying this particular item (e.g., node ID).

$module: The machine-readable name of the module that this item comes from (a module that implements hook_search_info()).

$text: The content of this item. Must be a piece of HTML or plain text.

Related topics

3 calls to search_index()
hook_update_index in modules/search/search.api.php
Update the search index for this module.
SearchMatchTestCase::_setup in modules/search/search.test
Set up a small index of items to test against.
_node_index_node in modules/node/node.module
Indexes a single node.
7 string references to 'search_index'
comment_node_view in modules/comment/comment.module
Implements hook_node_view().
hook_field_display_alter in modules/field/field.api.php
Alters the display settings of a field before it gets displayed.
hook_field_display_ENTITY_TYPE_alter in modules/field/field.api.php
Alters the display settings of a field on a given entity type before it gets displayed.
hook_update_index in modules/search/search.api.php
Update the search index for this module.
node_field_display_node_alter in modules/node/node.module
Implements hook_field_display_ENTITY_TYPE_alter().

... See full list

File

modules/search/search.module, line 554

Code

/**
 * Updates the full-text search index for a particular item.
 *
 * The text is reduced to the tags listed in the 'search_tag_weights' variable,
 * split into words, and each word is scored according to the tags enclosing
 * it. The item's previous index entries are cleared via search_reindex(), the
 * cleaned text is stored in {search_dataset}, the per-word scores are stored
 * in {search_index}, and the captions of links pointing at nodes on this site
 * are synchronized with {search_node_links}.
 *
 * @param $sid
 *   An ID number identifying this particular item (e.g., node ID).
 * @param $module
 *   The machine-readable name of the module that this item comes from (a
 *   module that implements hook_search_info()).
 * @param $text
 *   The content of this item. Must be a piece of HTML or plain text.
 */
function search_index($sid, $module, $text) {
    $minimum_word_size = variable_get('minimum_word_size', 3);
    // Link matching
    global $base_url;
    $node_regexp = '@href=[\'"]?(?:' . preg_quote($base_url, '@') . '/|' . preg_quote(base_path(), '@') . ')(?:\\?q=)?/?((?![a-z]+:)[^\'">]+)[\'">]@i';
    // Multipliers for scores of words inside certain HTML tags. The weights are stored
    // in a variable so that modules can overwrite the default weights.
    // Note: 'a' must be included for link ranking to work.
    $tags = variable_get('search_tag_weights', array(
        'h1' => 25,
        'h2' => 18,
        'h3' => 15,
        'h4' => 12,
        'h5' => 9,
        'h6' => 6,
        'u' => 3,
        'b' => 3,
        'i' => 3,
        'strong' => 3,
        'em' => 3,
        'a' => 10,
    ));
    // Strip off all ignored tags to speed up processing, but insert space before/after
    // them to keep word boundaries.
    $text = str_replace(array(
        '<',
        '>',
    ), array(
        ' <',
        '> ',
    ), $text);
    $text = strip_tags($text, '<' . implode('><', array_keys($tags)) . '>');
    // Split HTML tags from plain text.
    $split = preg_split('/\\s*<([^>]+?)>\\s*/', $text, -1, PREG_SPLIT_DELIM_CAPTURE);
    // Note: PHP ensures the array consists of alternating delimiters and literals
    // and begins and ends with a literal (inserting empty strings as required).
    // Odd/even counter. Tag or no tag.
    $tag = FALSE;
    // State variable for link analyzer.
    $link = FALSE;
    // Starting score per word.
    $score = 1;
    // Accumulator for cleaned up data.
    $accum = ' ';
    // Stack with open tags.
    $tagstack = array();
    // Counter for consecutive words.
    $tagwords = 0;
    // Focus state.
    $focus = 1;
    // Accumulator for words for index. Key 0 holds the word scores of this
    // item; every other key is a linked node ID holding its caption words.
    $results = array(
        0 => array(),
    );
    foreach ($split as $value) {
        if ($tag) {
            // Increase or decrease score per word based on tag
            list($tagname) = explode(' ', $value, 2);
            $tagname = drupal_strtolower($tagname);
            // Closing or opening tag? strpos() is used instead of $tagname[0]
            // because the tag name can be an empty string (e.g. "< b >" in
            // plain text survives strip_tags() and is split off as a tag).
            if (strpos($tagname, '/') === 0) {
                $tagname = substr($tagname, 1);
                // If we encounter unexpected tags, reset score to avoid incorrect boosting.
                if (!count($tagstack) || $tagstack[0] != $tagname) {
                    $tagstack = array();
                    $score = 1;
                }
                else {
                    // Remove from tag stack and decrement score. Tags without a
                    // configured weight count as 0.
                    $closed = array_shift($tagstack);
                    $score = max(1, $score - (isset($tags[$closed]) ? $tags[$closed] : 0));
                }
                if ($tagname == 'a') {
                    $link = FALSE;
                }
            }
            else {
                if (isset($tagstack[0]) && $tagstack[0] == $tagname) {
                    // None of the tags we look for make sense when nested identically.
                    // If they are, it's probably broken HTML.
                    $tagstack = array();
                    $score = 1;
                }
                else {
                    // Add to open tag stack and increment score. Tags without a
                    // configured weight count as 0.
                    array_unshift($tagstack, $tagname);
                    $score += isset($tags[$tagname]) ? $tags[$tagname] : 0;
                }
                if ($tagname == 'a') {
                    // Check if link points to a node on this site
                    if (preg_match($node_regexp, $value, $match)) {
                        $path = drupal_get_normal_path($match[1]);
                        if (preg_match('!(?:node|book)/(?:view/)?([0-9]+)!i', $path, $match)) {
                            $linknid = $match[1];
                            if ($linknid > 0) {
                                $node = db_query('SELECT title, nid, vid FROM {node} WHERE nid = :nid', array(
                                    ':nid' => $linknid,
                                ), array(
                                    'target' => 'slave',
                                ))->fetchObject();
                                // fetchObject() yields FALSE when no such node
                                // exists; only treat the anchor as a node link
                                // when the target was actually found.
                                if ($node) {
                                    $link = TRUE;
                                    $linktitle = $node->title;
                                }
                            }
                        }
                    }
                }
            }
            // A tag change occurred, reset counter.
            $tagwords = 0;
        }
        else {
            // Note: use of PREG_SPLIT_DELIM_CAPTURE above will introduce empty values
            if ($value != '') {
                if ($link) {
                    // Check to see if the node link text is its URL. If so, we use the target node title instead.
                    if (preg_match('!^https?://!i', $value)) {
                        $value = $linktitle;
                    }
                }
                $words = search_index_split($value);
                foreach ($words as $word) {
                    // Add word to accumulator
                    $accum .= $word . ' ';
                    // Check wordlength
                    if (is_numeric($word) || drupal_strlen($word) >= $minimum_word_size) {
                        // Links score mainly for the target.
                        if ($link) {
                            if (!isset($results[$linknid])) {
                                $results[$linknid] = array();
                            }
                            $results[$linknid][] = $word;
                            // Reduce score of the link caption in the source.
                            $focus *= 0.2;
                        }
                        // Fall-through
                        if (!isset($results[0][$word])) {
                            $results[0][$word] = 0;
                        }
                        $results[0][$word] += $score * $focus;
                        // Focus is a decaying value in terms of the amount of unique words up to this point.
                        // From 100 words and more, it decays, to e.g. 0.5 at 500 words and 0.3 at 1000 words.
                        $focus = min(1, 0.01 + 3.5 / (2 + count($results[0]) * 0.015));
                    }
                    $tagwords++;
                    // Too many words inside a single tag probably mean a tag was accidentally left open.
                    if (count($tagstack) && $tagwords >= 15) {
                        $tagstack = array();
                        $score = 1;
                    }
                }
            }
        }
        $tag = !$tag;
    }
    search_reindex($sid, $module, TRUE);
    // Insert cleaned up data into dataset
    db_insert('search_dataset')->fields(array(
        'sid' => $sid,
        'type' => $module,
        'data' => $accum,
        'reindex' => 0,
    ))
        ->execute();
    // Insert results into search index
    foreach ($results[0] as $word => $word_score) {
        // If a word already exists in the database, its score gets increased
        // appropriately. If not, we create a new record with the appropriate
        // starting score.
        db_merge('search_index')->key(array(
            'word' => $word,
            'sid' => $sid,
            'type' => $module,
        ))
            ->fields(array(
            'score' => $word_score,
        ))
            ->expression('score', 'score + :score', array(
            ':score' => $word_score,
        ))
            ->execute();
        search_dirty($word);
    }
    unset($results[0]);
    // Get all previous links from this item.
    $result = db_query("SELECT nid, caption FROM {search_node_links} WHERE sid = :sid AND type = :type", array(
        ':sid' => $sid,
        ':type' => $module,
    ), array(
        'target' => 'slave',
    ));
    $links = array();
    foreach ($result as $existing_link) {
        $links[$existing_link->nid] = $existing_link->caption;
    }
    // Now store links to nodes.
    foreach ($results as $nid => $words) {
        $caption = implode(' ', $words);
        if (isset($links[$nid])) {
            if ($links[$nid] != $caption) {
                // Update the existing link and mark the node for reindexing.
                db_update('search_node_links')->fields(array(
                    'caption' => $caption,
                ))
                    ->condition('sid', $sid)
                    ->condition('type', $module)
                    ->condition('nid', $nid)
                    ->execute();
                search_touch_node($nid);
            }
            // Unset the link to mark it as processed.
            unset($links[$nid]);
        }
        elseif ($sid != $nid || $module != 'node') {
            // Insert the new link and mark the node for reindexing, but don't
            // reindex if this is a link in a node pointing to itself.
            db_insert('search_node_links')->fields(array(
                'caption' => $caption,
                'sid' => $sid,
                'type' => $module,
                'nid' => $nid,
            ))
                ->execute();
            search_touch_node($nid);
        }
    }
    // Any left-over links in $links no longer exist. Delete them and mark the nodes for reindexing.
    foreach ($links as $nid => $caption) {
        db_delete('search_node_links')->condition('sid', $sid)
            ->condition('type', $module)
            ->condition('nid', $nid)
            ->execute();
        search_touch_node($nid);
    }
}

Buggy or inaccurate documentation? Please file an issue. Need support? Need help programming? Connect with the Drupal community.