summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorRemy Pecqueur <remy.pecqueur@student.ecp.fr>2012-01-04 20:58:04 +0100
committerRemy Pecqueur <remy.pecqueur@student.ecp.fr>2012-01-04 20:58:04 +0100
commitd49b7c4095432e18ccee856c15ba91c4d2e7464c (patch)
tree8e5972e27669b4d2d1fa9f31ac96ffc1947e582e
parent08a5911bc2392792d31a5cdfa5780af1398167c4 (diff)
Functional vector searches
-rw-r--r--app/Resources/views/layout.html.twig3
-rw-r--r--app/Resources/views/vector_search.html.twig4
-rw-r--r--src/Riu/Controller/Search.php17
-rw-r--r--src/Riu/Provider/RiuServiceProvider.php6
-rw-r--r--src/Riu/Search/FrequencyWeighter.php10
-rw-r--r--src/Riu/Search/TfIdfWeighter.php20
-rw-r--r--src/Riu/Search/VectorEngine.php11
7 files changed, 49 insertions, 22 deletions
diff --git a/app/Resources/views/layout.html.twig b/app/Resources/views/layout.html.twig
index d180d61..cef03b5 100644
--- a/app/Resources/views/layout.html.twig
+++ b/app/Resources/views/layout.html.twig
@@ -25,7 +25,8 @@
<div class="container">
<a href="{{ path('homepage') }}" class="brand">Riu search</a>
<ul class="nav">
- <li class="{{ active_page|default('') == 'vector' ? 'active': '' }}"><a href="{{ path('vector_search') }}">Recherche vectorielle</a></li>
+ <li class="{{ active_page|default('') == 'vector_frequency' ? 'active': '' }}"><a href="{{ path('vector_frequency_search') }}">Recherche vectorielle (fréquence)</a></li>
+ <li class="{{ active_page|default('') == 'vector_tfidf' ? 'active': '' }}"><a href="{{ path('vector_tfidf_search') }}">Recherche vectorielle (Tf-Idf)</a></li>
<li class="{{ active_page|default('') == 'boolean' ? 'active': '' }}"><a href="{{ path('boolean_search') }}">Recherche booléenne</a></li>
<li class="{{ active_page|default('') == 'probability' ? 'active': '' }}"><a href="{{ path('probability_search') }}">Recherche probabiliste</a></li>
</ul>
diff --git a/app/Resources/views/vector_search.html.twig b/app/Resources/views/vector_search.html.twig
index 672fabc..5e26872 100644
--- a/app/Resources/views/vector_search.html.twig
+++ b/app/Resources/views/vector_search.html.twig
@@ -1,13 +1,13 @@
{% extends 'layout.html.twig' %}
-{% set active_page = 'vector' %}
+{% set active_page = type %}
{% block content %}
<div class="page-header">
<h2>Recherche vectorielle</h2>
</div>
<div>
- <form action="{{ path('vector_search') }}" method="GET">
+ <form action="{{ path(type ~ '_search') }}" method="GET">
<div class="input">
<label for="search">Critère:</label>
<input type="search" name="q" value="{{ query|default('') }}" id="search" />
diff --git a/src/Riu/Controller/Search.php b/src/Riu/Controller/Search.php
index ada1f63..3977141 100644
--- a/src/Riu/Controller/Search.php
+++ b/src/Riu/Controller/Search.php
@@ -34,15 +34,26 @@ class Search implements ControllerProviderInterface
})->bind('boolean_search');
$controllers->get('/vector', function(Application $app, Request $request){
- $params = array();
+ $params = array('type' => 'vector_frequency');
+ if ($request->query->has('q')) {
+ $query = $request->query->get('q');
+ $params['query'] = $query;
+ $params['results'] = $app['search.engine.vector.frequency']->search($query);
+ }
+
+ return $app['twig']->render('vector_search.html.twig', $params);
+ })->bind('vector_frequency_search');
+
+ $controllers->get('/vector/tf_idf', function(Application $app, Request $request){
+ $params = array('type' => 'vector_tfidf');
if ($request->query->has('q')) {
$query = $request->query->get('q');
$params['query'] = $query;
- $params['results'] = $app['search.engine.vector']->search($query);
+ $params['results'] = $app['search.engine.vector.tfidf']->search($query);
}
return $app['twig']->render('vector_search.html.twig', $params);
- })->bind('vector_search');
+ })->bind('vector_tfidf_search');
$controllers->get('/document/{id}', function(Application $app, $id){
$document = $app['search.matcher']->getDocument($id);
diff --git a/src/Riu/Provider/RiuServiceProvider.php b/src/Riu/Provider/RiuServiceProvider.php
index 73dfa24..33c2a51 100644
--- a/src/Riu/Provider/RiuServiceProvider.php
+++ b/src/Riu/Provider/RiuServiceProvider.php
@@ -46,10 +46,14 @@ class RiuServiceProvider implements ServiceProviderInterface
return new BooleanEngine($app['search.matcher']);
});
- $app['search.engine.vector'] = $app->share(function(Application $app){
+ $app['search.engine.vector.frequency'] = $app->share(function(Application $app){
return new VectorEngine($app['search.indexer'], $app['search.matcher'], $app['search.weighter.frequency']);
});
+ $app['search.engine.vector.tfidf'] = $app->share(function(Application $app){
+ return new VectorEngine($app['search.indexer'], $app['search.matcher'], $app['search.weighter.tfidf']);
+ });
+
$app['search.engine.probability'] = $app->share(function(Application $app){
return new ProbabilityEngine($app['search.matcher'], $app['search.indexer']);
});
diff --git a/src/Riu/Search/FrequencyWeighter.php b/src/Riu/Search/FrequencyWeighter.php
index c49bd07..7092fb3 100644
--- a/src/Riu/Search/FrequencyWeighter.php
+++ b/src/Riu/Search/FrequencyWeighter.php
@@ -25,12 +25,10 @@ class FrequencyWeighter implements WeighterInterface
}
}
- foreach ($this->matcher->getDocuments() as $id => $doc) {
- if (!empty($weight[$id])) {
- $max = max($weight[$id]);
- foreach ($weight[$id] as $word => $w) {
- $weight[$id][$word] /= $max;
- }
+ $max = max(max($weight));
+ foreach ($weight as $id => $t) {
+ foreach ($weight[$id] as $word => $w) {
+ $weight[$id][$word] /= $max;
}
}
diff --git a/src/Riu/Search/TfIdfWeighter.php b/src/Riu/Search/TfIdfWeighter.php
index 609d2d3..2027b94 100644
--- a/src/Riu/Search/TfIdfWeighter.php
+++ b/src/Riu/Search/TfIdfWeighter.php
@@ -32,12 +32,11 @@ class TfIdfWeighter implements WeighterInterface
}
}
- foreach ($documents as $id => $doc) {
- if (!empty($weight[$id])) {
- $max = max($weight[$id]);
- foreach($weight[$id] as $word => $w) {
- $weight[$id][$word] /= $max;
- }
+ $max = max(max($weight));
+
+ foreach ($weight as $id => $t) {
+ foreach($weight[$id] as $word => $w) {
+ $weight[$id][$word] /= $max;
}
}
@@ -60,14 +59,17 @@ class TfIdfWeighter implements WeighterInterface
$idf = log( count($documents) / count($wordUses) );
$tf = log( $count / $sum + 1 );
$weight[$word] = $tf * $idf;
+ } else {
+ $weight[$word] = 0;
}
}
$max = max($weight);
- foreach ($weight as $word => $w) {
- $weight[$word] /= $max;
+ if ($max != 0) {
+ foreach ($weight as $word => $w) {
+ $weight[$word] /= $max;
+ }
}
-
return $weight;
}
}
diff --git a/src/Riu/Search/VectorEngine.php b/src/Riu/Search/VectorEngine.php
index fbf9b57..dff1acd 100644
--- a/src/Riu/Search/VectorEngine.php
+++ b/src/Riu/Search/VectorEngine.php
@@ -28,6 +28,8 @@ class VectorEngine implements EngineInterface
$queryDoc = new Document('-1', $query);
$queryWords = $this->indexer->indexDocument($queryDoc);
$similarity = array();
+ $docNorm = array();
+ $queryNorm = 0;
if (empty($queryWords)) {
return array();
@@ -41,14 +43,23 @@ class VectorEngine implements EngineInterface
$docWeight = $this->weighter->getDocumentWeights($queryWords);
foreach ($queryWords as $word => $t) {
+ $queryNorm += $queryWeight[$word] * $queryWeight[$word];
foreach ($this->matcher->getWordUses($word) as $id => $count) {
if (isset($similarity[$id])) {
$similarity[$id] += $queryWeight[$word] * $docWeight[$id][$word];
+ $docNorm[$id] += $docWeight[$id][$word] * $docWeight[$id][$word];
} else {
$similarity[$id] = $queryWeight[$word] * $docWeight[$id][$word];
+ $docNorm[$id] = $docWeight[$id][$word] * $docWeight[$id][$word];
}
}
}
+ $queryNorm = sqrt($queryNorm);
+
+ foreach ($similarity as $id => $s) {
+ $docNorm[$id] = sqrt($docNorm[$id]);
+ $similarity[$id] = $s / ($docNorm[$id] * $queryNorm);
+ }
arsort($similarity);