# HG changeset patch # User Franz Pletz # Date 1148737570 -7200 # Node ID ede8da99ce865be346e8f685a8c587cc6b94236d # Parent 83fcf5a907fcf3d73ec6e275b7f5b0acba696cce Remove last traces of lupy diff -r 83fcf5a907fc -r ede8da99ce86 setup.py --- a/setup.py Sat May 27 15:05:10 2006 +0200 +++ b/setup.py Sat May 27 15:46:10 2006 +0200 @@ -214,16 +214,13 @@ only requiring a Python installation. 'MoinMoin.script.import', 'MoinMoin.script.maint', 'MoinMoin.script.migration', - 'MoinMoin.script.lupy', 'MoinMoin.script.old', 'MoinMoin.script.old.migration', 'MoinMoin.script.old.xmlrpc-tools', 'MoinMoin.server', 'MoinMoin.stats', 'MoinMoin.support', - 'MoinMoin.support.lupy', - 'MoinMoin.support.lupy.index', - 'MoinMoin.support.lupy.search', + 'MoinMoin.support.xapwrap', 'MoinMoin.theme', 'MoinMoin.util', 'MoinMoin.widget', # HG changeset patch # User Franz Pletz # Date 1148739585 -7200 # Node ID 943a9756bcbe09ccf97599a277e946d3439c2e16 # Parent ede8da99ce865be346e8f685a8c587cc6b94236d removed lupy documentation diff -r ede8da99ce86 -r 943a9756bcbe docs/Lupy-0.2.1/LICENSE --- a/docs/Lupy-0.2.1/LICENSE Sat May 27 15:46:10 2006 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,504 +0,0 @@ - GNU LESSER GENERAL PUBLIC LICENSE - Version 2.1, February 1999 - - Copyright (C) 1991, 1999 Free Software Foundation, Inc. - 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - Everyone is permitted to copy and distribute verbatim copies - of this license document, but changing it is not allowed. - -[This is the first released version of the Lesser GPL. It also counts - as the successor of the GNU Library Public License, version 2, hence - the version number 2.1.] - - Preamble - - The licenses for most software are designed to take away your -freedom to share and change it. By contrast, the GNU General Public -Licenses are intended to guarantee your freedom to share and change -free software--to make sure the software is free for all its users. 
- - This license, the Lesser General Public License, applies to some -specially designated software packages--typically libraries--of the -Free Software Foundation and other authors who decide to use it. You -can use it too, but we suggest you first think carefully about whether -this license or the ordinary General Public License is the better -strategy to use in any particular case, based on the explanations below. - - When we speak of free software, we are referring to freedom of use, -not price. Our General Public Licenses are designed to make sure that -you have the freedom to distribute copies of free software (and charge -for this service if you wish); that you receive source code or can get -it if you want it; that you can change the software and use pieces of -it in new free programs; and that you are informed that you can do -these things. - - To protect your rights, we need to make restrictions that forbid -distributors to deny you these rights or to ask you to surrender these -rights. These restrictions translate to certain responsibilities for -you if you distribute copies of the library or if you modify it. - - For example, if you distribute copies of the library, whether gratis -or for a fee, you must give the recipients all the rights that we gave -you. You must make sure that they, too, receive or can get the source -code. If you link other code with the library, you must provide -complete object files to the recipients, so that they can relink them -with the library after making changes to the library and recompiling -it. And you must show them these terms so they know their rights. - - We protect your rights with a two-step method: (1) we copyright the -library, and (2) we offer you this license, which gives you legal -permission to copy, distribute and/or modify the library. - - To protect each distributor, we want to make it very clear that -there is no warranty for the free library. 
Also, if the library is -modified by someone else and passed on, the recipients should know -that what they have is not the original version, so that the original -author's reputation will not be affected by problems that might be -introduced by others. - - Finally, software patents pose a constant threat to the existence of -any free program. We wish to make sure that a company cannot -effectively restrict the users of a free program by obtaining a -restrictive license from a patent holder. Therefore, we insist that -any patent license obtained for a version of the library must be -consistent with the full freedom of use specified in this license. - - Most GNU software, including some libraries, is covered by the -ordinary GNU General Public License. This license, the GNU Lesser -General Public License, applies to certain designated libraries, and -is quite different from the ordinary General Public License. We use -this license for certain libraries in order to permit linking those -libraries into non-free programs. - - When a program is linked with a library, whether statically or using -a shared library, the combination of the two is legally speaking a -combined work, a derivative of the original library. The ordinary -General Public License therefore permits such linking only if the -entire combination fits its criteria of freedom. The Lesser General -Public License permits more lax criteria for linking other code with -the library. - - We call this license the "Lesser" General Public License because it -does Less to protect the user's freedom than the ordinary General -Public License. It also provides other free software developers Less -of an advantage over competing non-free programs. These disadvantages -are the reason we use the ordinary General Public License for many -libraries. However, the Lesser license provides advantages in certain -special circumstances. 
- - For example, on rare occasions, there may be a special need to -encourage the widest possible use of a certain library, so that it becomes -a de-facto standard. To achieve this, non-free programs must be -allowed to use the library. A more frequent case is that a free -library does the same job as widely used non-free libraries. In this -case, there is little to gain by limiting the free library to free -software only, so we use the Lesser General Public License. - - In other cases, permission to use a particular library in non-free -programs enables a greater number of people to use a large body of -free software. For example, permission to use the GNU C Library in -non-free programs enables many more people to use the whole GNU -operating system, as well as its variant, the GNU/Linux operating -system. - - Although the Lesser General Public License is Less protective of the -users' freedom, it does ensure that the user of a program that is -linked with the Library has the freedom and the wherewithal to run -that program using a modified version of the Library. - - The precise terms and conditions for copying, distribution and -modification follow. Pay close attention to the difference between a -"work based on the library" and a "work that uses the library". The -former contains code derived from the library, whereas the latter must -be combined with the library in order to run. - - GNU LESSER GENERAL PUBLIC LICENSE - TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION - - 0. This License Agreement applies to any software library or other -program which contains a notice placed by the copyright holder or -other authorized party saying it may be distributed under the terms of -this Lesser General Public License (also called "this License"). -Each licensee is addressed as "you". 
- - A "library" means a collection of software functions and/or data -prepared so as to be conveniently linked with application programs -(which use some of those functions and data) to form executables. - - The "Library", below, refers to any such software library or work -which has been distributed under these terms. A "work based on the -Library" means either the Library or any derivative work under -copyright law: that is to say, a work containing the Library or a -portion of it, either verbatim or with modifications and/or translated -straightforwardly into another language. (Hereinafter, translation is -included without limitation in the term "modification".) - - "Source code" for a work means the preferred form of the work for -making modifications to it. For a library, complete source code means -all the source code for all modules it contains, plus any associated -interface definition files, plus the scripts used to control compilation -and installation of the library. - - Activities other than copying, distribution and modification are not -covered by this License; they are outside its scope. The act of -running a program using the Library is not restricted, and output from -such a program is covered only if its contents constitute a work based -on the Library (independent of the use of the Library in a tool for -writing it). Whether that is true depends on what the Library does -and what the program that uses the Library does. - - 1. You may copy and distribute verbatim copies of the Library's -complete source code as you receive it, in any medium, provided that -you conspicuously and appropriately publish on each copy an -appropriate copyright notice and disclaimer of warranty; keep intact -all the notices that refer to this License and to the absence of any -warranty; and distribute a copy of this License along with the -Library. 
- - You may charge a fee for the physical act of transferring a copy, -and you may at your option offer warranty protection in exchange for a -fee. - - 2. You may modify your copy or copies of the Library or any portion -of it, thus forming a work based on the Library, and copy and -distribute such modifications or work under the terms of Section 1 -above, provided that you also meet all of these conditions: - - a) The modified work must itself be a software library. - - b) You must cause the files modified to carry prominent notices - stating that you changed the files and the date of any change. - - c) You must cause the whole of the work to be licensed at no - charge to all third parties under the terms of this License. - - d) If a facility in the modified Library refers to a function or a - table of data to be supplied by an application program that uses - the facility, other than as an argument passed when the facility - is invoked, then you must make a good faith effort to ensure that, - in the event an application does not supply such function or - table, the facility still operates, and performs whatever part of - its purpose remains meaningful. - - (For example, a function in a library to compute square roots has - a purpose that is entirely well-defined independent of the - application. Therefore, Subsection 2d requires that any - application-supplied function or table used by this function must - be optional: if the application does not supply it, the square - root function must still compute square roots.) - -These requirements apply to the modified work as a whole. If -identifiable sections of that work are not derived from the Library, -and can be reasonably considered independent and separate works in -themselves, then this License, and its terms, do not apply to those -sections when you distribute them as separate works. 
But when you -distribute the same sections as part of a whole which is a work based -on the Library, the distribution of the whole must be on the terms of -this License, whose permissions for other licensees extend to the -entire whole, and thus to each and every part regardless of who wrote -it. - -Thus, it is not the intent of this section to claim rights or contest -your rights to work written entirely by you; rather, the intent is to -exercise the right to control the distribution of derivative or -collective works based on the Library. - -In addition, mere aggregation of another work not based on the Library -with the Library (or with a work based on the Library) on a volume of -a storage or distribution medium does not bring the other work under -the scope of this License. - - 3. You may opt to apply the terms of the ordinary GNU General Public -License instead of this License to a given copy of the Library. To do -this, you must alter all the notices that refer to this License, so -that they refer to the ordinary GNU General Public License, version 2, -instead of to this License. (If a newer version than version 2 of the -ordinary GNU General Public License has appeared, then you can specify -that version instead if you wish.) Do not make any other change in -these notices. - - Once this change is made in a given copy, it is irreversible for -that copy, so the ordinary GNU General Public License applies to all -subsequent copies and derivative works made from that copy. - - This option is useful when you wish to copy part of the code of -the Library into a program that is not a library. - - 4. 
You may copy and distribute the Library (or a portion or -derivative of it, under Section 2) in object code or executable form -under the terms of Sections 1 and 2 above provided that you accompany -it with the complete corresponding machine-readable source code, which -must be distributed under the terms of Sections 1 and 2 above on a -medium customarily used for software interchange. - - If distribution of object code is made by offering access to copy -from a designated place, then offering equivalent access to copy the -source code from the same place satisfies the requirement to -distribute the source code, even though third parties are not -compelled to copy the source along with the object code. - - 5. A program that contains no derivative of any portion of the -Library, but is designed to work with the Library by being compiled or -linked with it, is called a "work that uses the Library". Such a -work, in isolation, is not a derivative work of the Library, and -therefore falls outside the scope of this License. - - However, linking a "work that uses the Library" with the Library -creates an executable that is a derivative of the Library (because it -contains portions of the Library), rather than a "work that uses the -library". The executable is therefore covered by this License. -Section 6 states terms for distribution of such executables. - - When a "work that uses the Library" uses material from a header file -that is part of the Library, the object code for the work may be a -derivative work of the Library even though the source code is not. -Whether this is true is especially significant if the work can be -linked without the Library, or if the work is itself a library. The -threshold for this to be true is not precisely defined by law. 
- - If such an object file uses only numerical parameters, data -structure layouts and accessors, and small macros and small inline -functions (ten lines or less in length), then the use of the object -file is unrestricted, regardless of whether it is legally a derivative -work. (Executables containing this object code plus portions of the -Library will still fall under Section 6.) - - Otherwise, if the work is a derivative of the Library, you may -distribute the object code for the work under the terms of Section 6. -Any executables containing that work also fall under Section 6, -whether or not they are linked directly with the Library itself. - - 6. As an exception to the Sections above, you may also combine or -link a "work that uses the Library" with the Library to produce a -work containing portions of the Library, and distribute that work -under terms of your choice, provided that the terms permit -modification of the work for the customer's own use and reverse -engineering for debugging such modifications. - - You must give prominent notice with each copy of the work that the -Library is used in it and that the Library and its use are covered by -this License. You must supply a copy of this License. If the work -during execution displays copyright notices, you must include the -copyright notice for the Library among them, as well as a reference -directing the user to the copy of this License. Also, you must do one -of these things: - - a) Accompany the work with the complete corresponding - machine-readable source code for the Library including whatever - changes were used in the work (which must be distributed under - Sections 1 and 2 above); and, if the work is an executable linked - with the Library, with the complete machine-readable "work that - uses the Library", as object code and/or source code, so that the - user can modify the Library and then relink to produce a modified - executable containing the modified Library. 
(It is understood - that the user who changes the contents of definitions files in the - Library will not necessarily be able to recompile the application - to use the modified definitions.) - - b) Use a suitable shared library mechanism for linking with the - Library. A suitable mechanism is one that (1) uses at run time a - copy of the library already present on the user's computer system, - rather than copying library functions into the executable, and (2) - will operate properly with a modified version of the library, if - the user installs one, as long as the modified version is - interface-compatible with the version that the work was made with. - - c) Accompany the work with a written offer, valid for at - least three years, to give the same user the materials - specified in Subsection 6a, above, for a charge no more - than the cost of performing this distribution. - - d) If distribution of the work is made by offering access to copy - from a designated place, offer equivalent access to copy the above - specified materials from the same place. - - e) Verify that the user has already received a copy of these - materials or that you have already sent this user a copy. - - For an executable, the required form of the "work that uses the -Library" must include any data and utility programs needed for -reproducing the executable from it. However, as a special exception, -the materials to be distributed need not include anything that is -normally distributed (in either source or binary form) with the major -components (compiler, kernel, and so on) of the operating system on -which the executable runs, unless that component itself accompanies -the executable. - - It may happen that this requirement contradicts the license -restrictions of other proprietary libraries that do not normally -accompany the operating system. Such a contradiction means you cannot -use both them and the Library together in an executable that you -distribute. - - 7. 
You may place library facilities that are a work based on the -Library side-by-side in a single library together with other library -facilities not covered by this License, and distribute such a combined -library, provided that the separate distribution of the work based on -the Library and of the other library facilities is otherwise -permitted, and provided that you do these two things: - - a) Accompany the combined library with a copy of the same work - based on the Library, uncombined with any other library - facilities. This must be distributed under the terms of the - Sections above. - - b) Give prominent notice with the combined library of the fact - that part of it is a work based on the Library, and explaining - where to find the accompanying uncombined form of the same work. - - 8. You may not copy, modify, sublicense, link with, or distribute -the Library except as expressly provided under this License. Any -attempt otherwise to copy, modify, sublicense, link with, or -distribute the Library is void, and will automatically terminate your -rights under this License. However, parties who have received copies, -or rights, from you under this License will not have their licenses -terminated so long as such parties remain in full compliance. - - 9. You are not required to accept this License, since you have not -signed it. However, nothing else grants you permission to modify or -distribute the Library or its derivative works. These actions are -prohibited by law if you do not accept this License. Therefore, by -modifying or distributing the Library (or any work based on the -Library), you indicate your acceptance of this License to do so, and -all its terms and conditions for copying, distributing or modifying -the Library or works based on it. - - 10. 
Each time you redistribute the Library (or any work based on the -Library), the recipient automatically receives a license from the -original licensor to copy, distribute, link with or modify the Library -subject to these terms and conditions. You may not impose any further -restrictions on the recipients' exercise of the rights granted herein. -You are not responsible for enforcing compliance by third parties with -this License. - - 11. If, as a consequence of a court judgment or allegation of patent -infringement or for any other reason (not limited to patent issues), -conditions are imposed on you (whether by court order, agreement or -otherwise) that contradict the conditions of this License, they do not -excuse you from the conditions of this License. If you cannot -distribute so as to satisfy simultaneously your obligations under this -License and any other pertinent obligations, then as a consequence you -may not distribute the Library at all. For example, if a patent -license would not permit royalty-free redistribution of the Library by -all those who receive copies directly or indirectly through you, then -the only way you could satisfy both it and this License would be to -refrain entirely from distribution of the Library. - -If any portion of this section is held invalid or unenforceable under any -particular circumstance, the balance of the section is intended to apply, -and the section as a whole is intended to apply in other circumstances. - -It is not the purpose of this section to induce you to infringe any -patents or other property right claims or to contest validity of any -such claims; this section has the sole purpose of protecting the -integrity of the free software distribution system which is -implemented by public license practices. 
Many people have made -generous contributions to the wide range of software distributed -through that system in reliance on consistent application of that -system; it is up to the author/donor to decide if he or she is willing -to distribute software through any other system and a licensee cannot -impose that choice. - -This section is intended to make thoroughly clear what is believed to -be a consequence of the rest of this License. - - 12. If the distribution and/or use of the Library is restricted in -certain countries either by patents or by copyrighted interfaces, the -original copyright holder who places the Library under this License may add -an explicit geographical distribution limitation excluding those countries, -so that distribution is permitted only in or among countries not thus -excluded. In such case, this License incorporates the limitation as if -written in the body of this License. - - 13. The Free Software Foundation may publish revised and/or new -versions of the Lesser General Public License from time to time. -Such new versions will be similar in spirit to the present version, -but may differ in detail to address new problems or concerns. - -Each version is given a distinguishing version number. If the Library -specifies a version number of this License which applies to it and -"any later version", you have the option of following the terms and -conditions either of that version or of any later version published by -the Free Software Foundation. If the Library does not specify a -license version number, you may choose any version ever published by -the Free Software Foundation. - - 14. If you wish to incorporate parts of the Library into other free -programs whose distribution conditions are incompatible with these, -write to the author to ask for permission. For software which is -copyrighted by the Free Software Foundation, write to the Free -Software Foundation; we sometimes make exceptions for this. 
Our -decision will be guided by the two goals of preserving the free status -of all derivatives of our free software and of promoting the sharing -and reuse of software generally. - - NO WARRANTY - - 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO -WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. -EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR -OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY -KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE -LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME -THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. - - 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN -WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY -AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU -FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR -CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE -LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING -RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A -FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF -SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH -DAMAGES. - - END OF TERMS AND CONDITIONS - - How to Apply These Terms to Your New Libraries - - If you develop a new library, and you want it to be of the greatest -possible use to the public, we recommend making it free software that -everyone can redistribute and change. You can do so by permitting -redistribution under these terms (or, alternatively, under the terms of the -ordinary General Public License). - - To apply these terms, attach the following notices to the library. 
It is -safest to attach them to the start of each source file to most effectively -convey the exclusion of warranty; and each file should have at least the -"copyright" line and a pointer to where the full notice is found. - - - Copyright (C) - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - -Also add information on how to contact you by electronic and paper mail. - -You should also get your employer (if you work as a programmer) or your -school, if any, to sign a "copyright disclaimer" for the library, if -necessary. Here is a sample; alter the names: - - Yoyodyne, Inc., hereby disclaims all copyright interest in the - library `Frob' (a library for tweaking knobs) written by James Random Hacker. - - , 1 April 1990 - Ty Coon, President of Vice - -That's all there is to it! - - diff -r ede8da99ce86 -r 943a9756bcbe docs/Lupy-0.2.1/README.txt --- a/docs/Lupy-0.2.1/README.txt Sat May 27 15:46:10 2006 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,64 +0,0 @@ -Lupy full text indexer r0.2.1 ------------------------------ - -**What is Lupy?** - Lupy is a port of the excellent Jakarta Lucene 1.2 into - Python. - -**What can I do with Lupy?** - Lupy is a full text indexer and search engine. It can be used to - index text documents such as web pages, source code, email, etc. 
- -**What is in this release?** - Most of Lucene 1.2 is in Lupy 0.2. Lupy supports text indexing - producing files that are binary compatible with Lucene. Index - creation, update and searching are supported. - - This release supports TermQuery, PhraseQuery and BooleanQuery. - -**What is not in this release?** - There is no locking or synchronization. - - The query parser has not been ported, nor all of the analysis/doc - parsing classes. Queries can be built using the basic building blocks. - - Tokenization is done with a simple regexp; there is no stop-lists, - Porter stemming, StandardAnalyzer or German analyzer. - - This release does not contain the following queries: - - - QueryParser - - MultiTermQuery - - FuzzyQuery - - WildCardQuery - - PrefixQuery - - RangeQuery - - Sloppy phrase queries - - DateField has not been ported. - - Merging of multiple multi-segment indices is not supported. - -**How do I get started?** - Look in the examples directory. - - Most of the Lucene documentation is relevant to Lupy: - - - http://jakarta.apache.org/lucene - - http://www.onjava.com/pub/a/onjava/2003/01/15/lucene.html - - http://darksleep.com/lucene/ - -**Performance** - Java is faster. - - -**Acknowledgements** - Many thanks to Doug Cutting and the Jakarta Lucene team for building - and enhancing such a high quality piece of open source software. - - Glyph Lefkowitz for serving as my language guru for Python and Java. - - Allen Short did the refactoring for the 0.2 release. 
- - I hope you find what you are searching for ;-) - amir@divmod.org diff -r ede8da99ce86 -r 943a9756bcbe docs/Lupy-0.2.1/releasenotes.txt --- a/docs/Lupy-0.2.1/releasenotes.txt Sat May 27 15:46:10 2006 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,22 +0,0 @@ -0.2.0 Release notes - -This release brings major reorganization of the code, grouping classes -into larger modules instead of the original Java style, as well as -rewriting several of the classes to be more Pythonic, removing -extraneous data structures and so forth; overall, the code has been -reduced by 20%. The public interface, indexer.py, has not changed; -other classes have not been changed significantly, other than being -moved to new modules. - -Also, this release changes the interface for analyzers: they are now -iterable objects that take one argument, the string to be tokenized, -and produce tokens, rather than the analysis classes ported from -Lucene. This improves performance while simplifying the code. If an -analyzer is not specified, lupy.index.documentwriter.standardTokenizer -is used. The regex used by that generator is re.compile("\\w+", re.U), -and the tokens are downcased before being stored. - -Along with this improvement in tokenization comes better Unicode -support; all text is now handled as Unicode strings. There is a -simple test for the indexing and retrieval of documents containing -non-ASCII data. # HG changeset patch # User Franz Pletz # Date 1149772306 -7200 # Node ID dc31818ae861ee356c1f85ec6685cc00a956f7bd # Parent bcdfbb11da9f3ae9e268b899e2aff8fb7610604b add xapian indexer script diff -r bcdfbb11da9f -r dc31818ae861 setup.py --- a/setup.py Mon Jun 05 19:23:26 2006 +0200 +++ b/setup.py Thu Jun 08 15:11:46 2006 +0200 @@ -212,6 +212,7 @@ only requiring a Python installation. 
'MoinMoin.script.cli', 'MoinMoin.script.export', 'MoinMoin.script.import', + 'MoinMoin.script.index', 'MoinMoin.script.maint', 'MoinMoin.script.migration', 'MoinMoin.script.old', # HG changeset patch # User Franz Pletz # Date 1149772396 -7200 # Node ID ef891c474e2f6d8b93466b9acee9ab6242e9fb76 # Parent dc31818ae861ee356c1f85ec6685cc00a956f7bd started work on new parser using pyparsing diff -r dc31818ae861 -r ef891c474e2f MoinMoin/search.py --- a/MoinMoin/search.py Thu Jun 08 15:11:46 2006 +0200 +++ b/MoinMoin/search.py Thu Jun 08 15:13:16 2006 +0200 @@ -5,7 +5,8 @@ @copyright: 2005 MoinMoin:FlorianFesti, 2005 MoinMoin:NirSoffer, 2005 MoinMoin:AlexanderSchremmer, - 2006 MoinMoin:ThomasWaldmann + 2006 MoinMoin:ThomasWaldmann, + 2006 MoinMoin:FranzPletz @license: GNU GPL, see COPYING for details """ @@ -639,6 +640,40 @@ class FoundRemote(FoundPage): ### Parse Query ############################################################################## +from pyparsing import Word, alphas, nums, oneOf, Optional, Suppress, \ + ZeroOrMore, Group, Forward + +def get_parser(): + # TODO: regexs, utf-8 + text = (Word(alphas + nums) | + Suppress('"') + Word(alphas + + nums ' ') + Suppress('"')) + text.setName('text') + + # TODO: abbreviation to any length... 
+ prefix = oneOf(('regex', 'title', 'case', 'linkto'), caseless=True) + \ + Suppress(':') + prefix.setName('prefix') + + prefixedText = Optional(prefix) + text + prefixedText.setName('prefixedText') + + logOp = Optional(oneOf(('and', 'or'), caseless=True)) + logOp.setParseAction(lambda x, y, z: not z and 'and' or z) + logOp.setName('logOp') + + negation = oneOf(('not', '-'), caseless=True) + negation.setName('negation') + + term = Forward() + bracketedTerm = Suppress('(') + term + Suppress(')') + term << (Optional(negation) + Group(prefixedText | bracketedTerm)) + term.setName('term') + + expression = term + ZeroOrMore(logOp + term) + expression.setName('expression') + + return expression + class QueryParser: """ Converts a String into a tree of Query objects @@ -660,10 +695,68 @@ class QueryParser: if isinstance(query, str): query = query.decode(config.charset) self._query = query + #parsed_query = get_parser().parseString(query) + #result = self._build_tree(parsed_query) result = self._or_expression() if result is None: result = BaseExpression() return result + + def _build_tree(self, query, neg=False): + n = len(query) + + if query.getName() == 'prefixedName': + if n == 1: # single word + prefix = None + text = query[0] + elif n == 2: # prefixed word + prefix, text = q + else: + raise Exception # XXX + + title_search = self.titlesearch + regex = self.regex + case = self.case + linkto = False + + if prefix: + if prefix == 'title': + title_search = True + elif prefix == 'regex': + regex = True + elif prefix == 'case': + case = True + elif prefix == 'linkto': + linkto = True + else: + raise Exception # XXX + + if linkto: + obj = LinkSearch(text, use_re=regex, case=case) + elif title_search: + obj = TitleSearch(text, use_re=regex, case=case) + else: + obj = TextSearch(text, use_re=regex, case=case) + + if neg: + obj.negate() + + return obj + + result = None + for q in query: + name = q.getName() + if name == 'term': + conj = [AndExpression()] + for i in q: + if 
i.getName() == 'logOp': + if i == 'or': + conj = [OrExpression(conj[-1]), AndExpression()] + else: + i = self._build_tree(i) + + return x + def _or_expression(self): result = self._and_expression() # HG changeset patch # User Franz Pletz # Date 1149777910 -7200 # Node ID 8dbfb6826497df014cd333d531faa5f431c3a187 # Parent ef891c474e2f6d8b93466b9acee9ab6242e9fb76 MoinMoin.script._util -> MoinMoin.script diff -r ef891c474e2f -r 8dbfb6826497 MoinMoin/script/index/build.py --- a/MoinMoin/script/index/build.py Thu Jun 08 15:13:16 2006 +0200 +++ b/MoinMoin/script/index/build.py Thu Jun 08 16:45:10 2006 +0200 @@ -9,8 +9,7 @@ @license: GNU GPL, see COPYING for details. """ -from MoinMoin.script import _util -from MoinMoin.script._util import MoinScript +from MoinMoin.script import MoinScript from MoinMoin.Xapian import Index class IndexScript(MoinScript): # HG changeset patch # User Franz Pletz # Date 1149868643 -7200 # Node ID 413cc62c6ec4f531ac454944487f6c49b62f2572 # Parent 8dbfb6826497df014cd333d531faa5f431c3a187 fix for the xapian indexer diff -r 8dbfb6826497 -r 413cc62c6ec4 MoinMoin/Xapian.py --- a/MoinMoin/Xapian.py Thu Jun 08 16:45:10 2006 +0200 +++ b/MoinMoin/Xapian.py Fri Jun 09 17:57:23 2006 +0200 @@ -485,6 +485,8 @@ class Index: pagename = page.page_name mtime = page.mtime_usecs() itemid = "%s:%s" % (wikiname, pagename) + updated = False + if mode == 'update': # from #xapian: if you generate a special "unique id" term, you can just call database.replace_document(uid_term, doc) query = xapidx.RawQuery(xapdoc.makePairForWrite('itemid', itemid)) @@ -627,10 +629,10 @@ class Index: read any page. Without this policy some pages will not render, which will create broken pagelinks index. 
""" - from MoinMoin.request import RequestCLI - from MoinMoin.security import Permissions - request = RequestCLI(request.url) - class SecurityPolicy(Permissions): + from MoinMoin.request.CLI import Request + from MoinMoin.security import Permissions + request = Request(request.url) + class SecurityPolicy(Permissions): def read(*args, **kw): return True request.user.may = SecurityPolicy(request.user) diff -r 8dbfb6826497 -r 413cc62c6ec4 MoinMoin/script/index/build.py --- a/MoinMoin/script/index/build.py Thu Jun 08 16:45:10 2006 +0200 +++ b/MoinMoin/script/index/build.py Fri Jun 09 17:57:23 2006 +0200 @@ -10,7 +10,6 @@ """ from MoinMoin.script import MoinScript -from MoinMoin.Xapian import Index class IndexScript(MoinScript): """ Xapian general index script class """ @@ -39,6 +38,7 @@ class PluginScript(IndexScript): """ Xapian index build script class """ def command(self): + from MoinMoin.Xapian import Index Index(self.request).indexPages(self.files, self.options.mode) #Index(self.request).test(self.request) # HG changeset patch # User Franz Pletz # Date 1149868826 -7200 # Node ID 2588317f8b2bdf4a9a8fa01011d3fa16f90328be # Parent 413cc62c6ec4f531ac454944487f6c49b62f2572 preliminary parser stuff diff -r 413cc62c6ec4 -r 2588317f8b2b MoinMoin/search.py --- a/MoinMoin/search.py Fri Jun 09 17:57:23 2006 +0200 +++ b/MoinMoin/search.py Fri Jun 09 18:00:26 2006 +0200 @@ -10,11 +10,12 @@ @license: GNU GPL, see COPYING for details """ -import re, time, sys, StringIO +import re, time, sys, StringIO, operator from MoinMoin import wikiutil, config from MoinMoin.Page import Page import Xapian +import xapian ############################################################################# ### query objects @@ -174,6 +175,7 @@ class AndExpression(BaseExpression): return wanted def xapian_term(self): + #return xapian.Query(xapian.Query.OP_AND, [term.xapian_term() for term in self._subterms]) return "(%s)" % " AND ".join([term.xapian_term() for term in self._subterms]) @@ -197,6 +199,7 
@@ class OrExpression(AndExpression): return matches def xapian_term(self): + #return xapian.Query(xapian.Query.OP_OR, [term.xapian_term() for term in self._subterms]) return "(%s)" % " OR ".join([term.xapian_term() for term in self._subterms]) @@ -259,16 +262,33 @@ class TextSearch(BaseExpression): def xapian_term(self): if self.use_re: - return '' # xapian can't do regex search + return None # xapian can't do regex search else: analyzer = Xapian.WikiAnalyzer() terms = self._pattern.split() - terms = [list(analyzer.tokenize(t)) for t in terms] + term = [] for t in terms: term.append(" AND ".join(t)) term = "(%s OR %s)" % (self.titlesearch.xapian_term(), " AND ".join(term)) return "%s %s" % (self.negated and "NOT" or "", term) + + # all parsed wikiwords, AND'ed + terms = reduce(operator.add, + [xapian.Query( + xapian.Query.OP_AND, + list(analyzer.tokenize(t)) + ) for t in terms]) + + # titlesearch OR parsed wikiwords + term = xapian.Query(xapian.Query.OP_OR, + (self.titlesearch.xapian_term(), + xapian.Query(xapian.Query.OP_AND, terms))) + + # TODO: proper negation?! 
+ return (not self.negated and term or + xapian.Query(xapian.Query.OP_AND_NOT, + ('U_CANT_MATCH_THIS', term))) class TitleSearch(BaseExpression): """ Term searches in pattern in page title only """ @@ -294,7 +314,7 @@ class TitleSearch(BaseExpression): return u'%s!"%s"' % (neg, unicode(self._pattern)) def highlight_re(self): - return u"(%s)" % self._pattern + return u"(%s)" % self._pattern def pageFilter(self): """ Page filter function for single title search """ @@ -326,7 +346,7 @@ class TitleSearch(BaseExpression): def xapian_term(self): if self.use_re: - return '' # xapian doesn't support regex search + return None # xapian doesn't support regex search else: analyzer = Xapian.WikiAnalyzer() terms = self._pattern.split() @@ -336,6 +356,20 @@ class TitleSearch(BaseExpression): term.append(" AND ".join(t)) term = '%s title:(%s)' % (self.negated and "NOT" or "", " AND ".join(term)) return term + + # all parsed wikiwords, AND'ed + terms = reduce(operator.add, + [xapian.Query( + xapian.Query.OP_AND, + list(analyzer.tokenize(t)) + ) for t in terms]) + + # titlesearch + #term = + + # no titlesearch for now + raise Exception + class LinkSearch(BaseExpression): """ Search the term in the pagelinks """ @@ -646,7 +680,7 @@ def get_parser(): def get_parser(): # TODO: regexs, utf-8 text = (Word(alphas + nums) | - Suppress('"') + Word(alphas + + nums ' ') + Suppress('"')) + Suppress('"') + Word(alphas + nums + ' ') + Suppress('"')) text.setName('text') # TODO: abbreviation to any length... 
@@ -749,6 +783,7 @@ class QueryParser: if name == 'term': conj = [AndExpression()] for i in q: + # TODO if i.getName() == 'logOp': if i == 'or': conj = [OrExpression(conj[-1]), AndExpression()] @@ -757,7 +792,6 @@ class QueryParser: return x - def _or_expression(self): result = self._and_expression() if self._query: # HG changeset patch # User Franz Pletz # Date 1149891113 -7200 # Node ID e5f79524f75dcf1ca834f1720074056cb79b8cb6 # Parent 09e8cb58f6378ab4dc0c2f2bf5575d66b131f8a4 converted moin qp to use xapian.Query objects diff -r 09e8cb58f637 -r e5f79524f75d MoinMoin/Xapian.py --- a/MoinMoin/Xapian.py Fri Jun 09 18:07:37 2006 +0200 +++ b/MoinMoin/Xapian.py Sat Jun 10 00:11:53 2006 +0200 @@ -70,7 +70,7 @@ class WikiAnalyzer: for word in self.mail_re.split(m.group("email")): if word: yield enc(word) - elif m.group("hostname"): + elif m.group("hostname"): for word in self.dot_re.split(m.group("hostname")): yield enc(word) elif m.group("num"): @@ -109,7 +109,7 @@ class UpdateQueue: try: f.write(pagename + "\n") finally: - f.close() + f.close() finally: self.writeLock.release() @@ -119,7 +119,7 @@ class UpdateQueue: try: return self._decode(self._read()) finally: - self.readLock.release() + self.readLock.release() return [] def remove(self, pages): @@ -190,7 +190,7 @@ class UpdateQueue: try: f.write(data) finally: - f.close() + f.close() def _removeFile(self): """ Remove queue file @@ -280,7 +280,7 @@ class Index: searcher.configure(self.prefixMap, self.indexValueMap) timestamp = self.mtime() break - + hits = searcher.search(query, valuesWanted=['pagename', 'attachment', 'mtime', 'wikiname']) self.request.cfg.xapian_searchers.append((searcher, timestamp)) return hits @@ -403,8 +403,8 @@ class Index: execute = wikiutil.importPlugin(request.cfg, 'filter', modulename) except wikiutil.PluginMissingError: pass - else: - raise "Cannot load filter for mimetype." # XXX + #else: + # raise "Cannot load filter for mimetype." 
+ modulename # XXX try: data = execute(self, filename) if debug: @@ -412,7 +412,7 @@ class Index: except (OSError, IOError), err: data = '' request.log("Filter %s threw error '%s' for file %s" % (modulename, str(err), filename)) - return mimetype, data + return mt.mime_type(), data def test(self, request): idx = xapidx.ReadOnlyIndex(self.dir) diff -r 09e8cb58f637 -r e5f79524f75d MoinMoin/search.py --- a/MoinMoin/search.py Fri Jun 09 18:07:37 2006 +0200 +++ b/MoinMoin/search.py Sat Jun 10 00:11:53 2006 +0200 @@ -147,7 +147,7 @@ class AndExpression(BaseExpression): def sortByCost(self): tmp = [(term.costs(), term) for term in self._subterms] tmp.sort() - self._subterms = [item[1] for item in tmp] + self._subterms = [item[1] for item in tmp] def search(self, page): """ Search for each term, cheap searches first """ @@ -175,8 +175,33 @@ class AndExpression(BaseExpression): return wanted def xapian_term(self): - #return xapian.Query(xapian.Query.OP_AND, [term.xapian_term() for term in self._subterms]) - return "(%s)" % " AND ".join([term.xapian_term() for term in self._subterms]) + # sort negated terms + terms = [] + not_terms = [] + for term in self._subterms: + if not term.negated: + terms.append(term.xapian_term()) + else: + not_terms.append(term.xapian_term()) + + # prepare query for not negated terms + if len(terms) == 1: + t1 = xapian.Query(terms[0]) + else: + t1 = xapian.Query(xapian.Query.OP_AND, terms) + + # negated terms? 
+ if not not_terms: + # no, just return query for not negated terms + return t1 + + # yes, link not negated and negated terms' query with a AND_NOT query + if len(not_terms) == 1: + t2 = xapian.Query(not_terms[0]) + else: + t2 = xapian.Query(xapian.Query.OP_AND, not_terms) + + return xapian.Query(xapian.Query.OP_AND_NOT, t1, t2) class OrExpression(AndExpression): @@ -199,8 +224,8 @@ class OrExpression(AndExpression): return matches def xapian_term(self): - #return xapian.Query(xapian.Query.OP_OR, [term.xapian_term() for term in self._subterms]) - return "(%s)" % " OR ".join([term.xapian_term() for term in self._subterms]) + # XXX: negated terms managed by _moinSearch? + return xapian.Query(xapian.Query.OP_OR, [term.xapian_term() for term in self._subterms]) class TextSearch(BaseExpression): @@ -267,28 +292,22 @@ class TextSearch(BaseExpression): analyzer = Xapian.WikiAnalyzer() terms = self._pattern.split() - term = [] + # all parsed wikiwords, AND'ed + queries = [] for t in terms: - term.append(" AND ".join(t)) - term = "(%s OR %s)" % (self.titlesearch.xapian_term(), " AND ".join(term)) - return "%s %s" % (self.negated and "NOT" or "", term) - - # all parsed wikiwords, AND'ed - terms = reduce(operator.add, - [xapian.Query( - xapian.Query.OP_AND, - list(analyzer.tokenize(t)) - ) for t in terms]) + t = [i.encode('utf-8') for i in list(analyzer.tokenize(t))] + if len(t) < 2: + queries.append(xapian.Query(t[0])) + else: + queries.append(xapian.Query(xapian.Query.OP_AND, t)) # titlesearch OR parsed wikiwords term = xapian.Query(xapian.Query.OP_OR, (self.titlesearch.xapian_term(), - xapian.Query(xapian.Query.OP_AND, terms))) - - # TODO: proper negation?! 
- return (not self.negated and term or - xapian.Query(xapian.Query.OP_AND_NOT, - ('U_CANT_MATCH_THIS', term))) + xapian.Query(xapian.Query.OP_AND, queries))) + + return term + class TitleSearch(BaseExpression): """ Term searches in pattern in page title only """ @@ -351,24 +370,20 @@ class TitleSearch(BaseExpression): analyzer = Xapian.WikiAnalyzer() terms = self._pattern.split() terms = [list(analyzer.tokenize(t)) for t in terms] - term = [] + + # all parsed wikiwords, AND'ed + queries = [] for t in terms: - term.append(" AND ".join(t)) - term = '%s title:(%s)' % (self.negated and "NOT" or "", " AND ".join(term)) + t = [i.encode('utf-8') for i in list(analyzer.tokenize(t))] + t = ['title:%s' % i for i in t] + if len(t) < 2: + queries.append(xapian.Query(t[0])) + else: + queries.append(xapian.Query(xapian.Query.OP_AND, t)) + + term = xapian.Query(xapian.Query.OP_AND, queries) + return term - - # all parsed wikiwords, AND'ed - terms = reduce(operator.add, - [xapian.Query( - xapian.Query.OP_AND, - list(analyzer.tokenize(t)) - ) for t in terms]) - - # titlesearch - #term = - - # no titlesearch for now - raise Exception class LinkSearch(BaseExpression): @@ -412,7 +427,7 @@ class LinkSearch(BaseExpression): return u'%s!"%s"' % (neg, unicode(self._pattern)) def highlight_re(self): - return u"(%s)" % self._textpattern + return u"(%s)" % self._textpattern def search(self, page): # Get matches in page name @@ -451,9 +466,9 @@ class LinkSearch(BaseExpression): def xapian_term(self): pattern = self.pattern if self.use_re: - return '' # xapian doesnt support regex search - else: - term = '%s linkto:%s' % (self.negated and "NOT" or "", pattern.lower()) + return None # xapian doesnt support regex search + else: + term = xapian.Query(('linkto:%s' % pattern.lower()).encode('utf-8')) return term ############################################################################ @@ -674,39 +689,6 @@ class FoundRemote(FoundPage): ### Parse Query 
############################################################################## -from pyparsing import Word, alphas, nums, oneOf, Optional, Suppress, \ - ZeroOrMore, Group, Forward - -def get_parser(): - # TODO: regexs, utf-8 - text = (Word(alphas + nums) | - Suppress('"') + Word(alphas + nums + ' ') + Suppress('"')) - text.setName('text') - - # TODO: abbreviation to any length... - prefix = oneOf(('regex', 'title', 'case', 'linkto'), caseless=True) + \ - Suppress(':') - prefix.setName('prefix') - - prefixedText = Optional(prefix) + text - prefixedText.setName('prefixedText') - - logOp = Optional(oneOf(('and', 'or'), caseless=True)) - logOp.setParseAction(lambda x, y, z: not z and 'and' or z) - logOp.setName('logOp') - - negation = oneOf(('not', '-'), caseless=True) - negation.setName('negation') - - term = Forward() - bracketedTerm = Suppress('(') + term + Suppress(')') - term << (Optional(negation) + Group(prefixedText | bracketedTerm)) - term.setName('term') - - expression = term + ZeroOrMore(logOp + term) - expression.setName('expression') - - return expression class QueryParser: """ @@ -729,68 +711,10 @@ class QueryParser: if isinstance(query, str): query = query.decode(config.charset) self._query = query - #parsed_query = get_parser().parseString(query) - #result = self._build_tree(parsed_query) result = self._or_expression() if result is None: result = BaseExpression() return result - - def _build_tree(self, query, neg=False): - n = len(query) - - if query.getName() == 'prefixedName': - if n == 1: # single word - prefix = None - text = query[0] - elif n == 2: # prefixed word - prefix, text = q - else: - raise Exception # XXX - - title_search = self.titlesearch - regex = self.regex - case = self.case - linkto = False - - if prefix: - if prefix == 'title': - title_search = True - elif prefix == 'regex': - regex = True - elif prefix == 'case': - case = True - elif prefix == 'linkto': - linkto = True - else: - raise Exception # XXX - - if linkto: - obj = 
LinkSearch(text, use_re=regex, case=case) - elif title_search: - obj = TitleSearch(text, use_re=regex, case=case) - else: - obj = TextSearch(text, use_re=regex, case=case) - - if neg: - obj.negate() - - return obj - - result = None - for q in query: - name = q.getName() - if name == 'term': - conj = [AndExpression()] - for i in q: - # TODO - if i.getName() == 'logOp': - if i == 'or': - conj = [OrExpression(conj[-1]), AndExpression()] - else: - i = self._build_tree(i) - - return x def _or_expression(self): result = self._and_expression() @@ -866,7 +790,7 @@ class QueryParser: if match.group("NEG"): obj.negate() - return obj + return obj def isQuoted(self, text): # Empty string '' is not considered quoted @@ -976,7 +900,7 @@ class SearchResults: matchInfo, f.listitem(0), ] - write(''.join(item)) + write(''.join(item)) write(list(0)) return self.getvalue() @@ -1339,8 +1263,9 @@ class Search: try: from MoinMoin.support import xapwrap query = self.query.xapian_term() - self.request.log("xapianSearch: query = %r" % query) - query = xapwrap.index.ParsedQuery(query) + self.request.log("xapianSearch: query = %r" % + query.get_description()) + query = xapwrap.index.QObjQuery(query) hits = index.search(query) self.request.log("xapianSearch: finds: %r" % hits) def dict_decode(d): # HG changeset patch # User Franz Pletz # Date 1149891246 -7200 # Node ID cc933a8740377da52fbcee2a70c7d63353a44b91 # Parent e5f79524f75dcf1ca834f1720074056cb79b8cb6 add QObjQuery to xapwrap to support our parsed query input diff -r e5f79524f75d -r cc933a874037 MoinMoin/support/xapwrap/index.py --- a/MoinMoin/support/xapwrap/index.py Sat Jun 10 00:11:53 2006 +0200 +++ b/MoinMoin/support/xapwrap/index.py Sat Jun 10 00:14:06 2006 +0200 @@ -878,6 +878,13 @@ class RawQuery(Query): def prepare(self, queryParser): return xapian.Query(self.queryString) +class QObjQuery(Query): + def __init__(self, query): + assert isinstance(query, xapian.Query) + self.query = query + + def prepare(self, queryParser): + 
return self.query class SmartIndex(Index): documentFactory = Document # HG changeset patch # User Franz Pletz # Date 1149892001 -7200 # Node ID 6c9fa6bc2ed581ba73a131331efa2adb100cf734 # Parent 150bf5552d18376df6b12ba3592140a22b1ed20f update CHANGES.fpletz diff -r 150bf5552d18 -r 6c9fa6bc2ed5 docs/CHANGES.fpletz --- a/docs/CHANGES.fpletz Sat Jun 10 00:17:19 2006 +0200 +++ b/docs/CHANGES.fpletz Sat Jun 10 00:26:41 2006 +0200 @@ -1,6 +1,3 @@ Please use your CHANGES.$yourname for re -Please use your CHANGES.$yourname for recording your changes you do while -Google Summer of Code. - Branch moin/1.6-xapian-fpletz ============================= @@ -8,10 +5,12 @@ Branch moin/1.6-xapian-fpletz * ... ToDo: - * ... + * Manually parse prefixes (e.g. title:) in MoinMoin.Xapian.Index + right before searching + * Mockup the new search UI New Features: - * ... + * TBD Bugfixes (only stuff that is buggy in moin/1.6 main branch): * ... @@ -25,9 +24,12 @@ Branch moin/1.6-xapian-fpletz Diary ===== -Please make at least one entry per day (and commit it) about what your work was about. -2006-05-29 ... -2006-05-30 ... -2006-05-31 ... +2006-06-10 Changed xapian_term() functions to return xapian.Query objects +but without touching the prefixes as we don't have a prefixMap yet. Will +implement this in MoinMoin.Xapian.Index. AndExpression needed some more +tweaking to use AND_NOT because Xapian doesn't provide a pure NOT. Should +be no issue with OrExpression as _moinSearch handles this correctly. 
+2006-06-11 + # HG changeset patch # User Franz Pletz # Date 1149923474 -7200 # Node ID 0842d11e0f01235f460c377fc7a936bbf34e3b42 # Parent 6c9fa6bc2ed581ba73a131331efa2adb100cf734 fix combination of negated terms in AndExpression diff -r 6c9fa6bc2ed5 -r 0842d11e0f01 MoinMoin/search.py --- a/MoinMoin/search.py Sat Jun 10 00:26:41 2006 +0200 +++ b/MoinMoin/search.py Sat Jun 10 09:11:14 2006 +0200 @@ -199,7 +199,7 @@ class AndExpression(BaseExpression): if len(not_terms) == 1: t2 = xapian.Query(not_terms[0]) else: - t2 = xapian.Query(xapian.Query.OP_AND, not_terms) + t2 = xapian.Query(xapian.Query.OP_OR, not_terms) return xapian.Query(xapian.Query.OP_AND_NOT, t1, t2) # HG changeset patch # User Franz Pletz # Date 1149929028 -7200 # Node ID 7250d2be26d6ea5c7e608716d2234299cbc47db7 # Parent 0842d11e0f01235f460c377fc7a936bbf34e3b42 add prefix support, make RawQuery unicode-compatible diff -r 0842d11e0f01 -r 7250d2be26d6 MoinMoin/Xapian.py --- a/MoinMoin/Xapian.py Sat Jun 10 09:11:14 2006 +0200 +++ b/MoinMoin/Xapian.py Sat Jun 10 10:43:48 2006 +0200 @@ -205,6 +205,41 @@ class UpdateQueue: class Index: + indexValueMap = { + # mapping the value names we can easily fetch from the index to + # integers required by xapian. 0 and 1 are reserved by xapwrap! + 'pagename': 2, + 'attachment': 3, + 'mtime': 4, + 'wikiname': 5, + } + prefixMap = { + # http://svn.xapian.org/*checkout*/trunk/xapian-applications/omega/docs/termprefixes.txt + 'author': 'A', + 'date': 'D', # numeric format: YYYYMMDD or "latest" - e.g. D20050224 or Dlatest + #G newsGroup (or similar entity - e.g. a web forum name) + 'hostname': 'H', + 'keyword': 'K', + 'lang': 'L', # ISO Language code + #M Month (numeric format: YYYYMM) + #N ISO couNtry code (or domaiN name) + #P Pathname + #Q uniQue id + #R Raw (i.e. 
unstemmed) term + 'title': 'S', # Subject (or title) + 'mimetype': 'T', + 'url': 'U', # full URL of indexed document - if the resulting term would be > 240 + # characters, a hashing scheme is used to prevent overflowing + # the Xapian term length limit (see omindex for how to do this). + #W "weak" (approximately 10 day intervals, taken as YYYYMMD from + # the D term, and changing the last digit to a '2' if it's a '3') + #X longer prefix for user-defined use + 'linkto': 'XLINKTO', # this document links to that document + #Y year (four digits) + } + + + class LockedException(Exception): pass @@ -227,38 +262,6 @@ class Index: ## if not self.exists(): ## self.indexPagesInNewThread(request) - self.indexValueMap = { - # mapping the value names we can easily fetch from the index to - # integers required by xapian. 0 and 1 are reserved by xapwrap! - 'pagename': 2, - 'attachment': 3, - 'mtime': 4, - 'wikiname': 5, - } - self.prefixMap = { # http://svn.xapian.org/*checkout*/trunk/xapian-applications/omega/docs/termprefixes.txt - 'author': 'A', - 'date': 'D', # numeric format: YYYYMMDD or "latest" - e.g. D20050224 or Dlatest - #G newsGroup (or similar entity - e.g. a web forum name) - 'hostname': 'H', - 'keyword': 'K', - 'lang': 'L', # ISO Language code - #M Month (numeric format: YYYYMM) - #N ISO couNtry code (or domaiN name) - #P Pathname - #Q uniQue id - #R Raw (i.e. unstemmed) term - 'title': 'S', # Subject (or title) - 'mimetype': 'T', - 'url': 'U', # full URL of indexed document - if the resulting term would be > 240 - # characters, a hashing scheme is used to prevent overflowing - # the Xapian term length limit (see omindex for how to do this). 
- #W "weak" (approximately 10 day intervals, taken as YYYYMMD from - # the D term, and changing the last digit to a '2' if it's a '3') - #X longer prefix for user-defined use - 'linkto': 'XLINKTO', # this document links to that document - #Y year (four digits) - } - def exists(self): """ Check if index exists """ return os.path.exists(self.sig_file) diff -r 0842d11e0f01 -r 7250d2be26d6 MoinMoin/search.py --- a/MoinMoin/search.py Sat Jun 10 09:11:14 2006 +0200 +++ b/MoinMoin/search.py Sat Jun 10 10:43:48 2006 +0200 @@ -375,7 +375,7 @@ class TitleSearch(BaseExpression): queries = [] for t in terms: t = [i.encode('utf-8') for i in list(analyzer.tokenize(t))] - t = ['title:%s' % i for i in t] + t = ['%s%s' % (Xapian.Index.prefixMap['title'], i) for i in t] if len(t) < 2: queries.append(xapian.Query(t[0])) else: @@ -468,7 +468,7 @@ class LinkSearch(BaseExpression): if self.use_re: return None # xapian doesnt support regex search else: - term = xapian.Query(('linkto:%s' % pattern.lower()).encode('utf-8')) + term = xapian.Query(('%s%s' % (Xapian.Index.prefixMap['linkto'], pattern.lower())).encode('utf-8')) return term ############################################################################ diff -r 0842d11e0f01 -r 7250d2be26d6 MoinMoin/support/xapwrap/index.py --- a/MoinMoin/support/xapwrap/index.py Sat Jun 10 09:11:14 2006 +0200 +++ b/MoinMoin/support/xapwrap/index.py Sat Jun 10 10:43:48 2006 +0200 @@ -872,6 +872,9 @@ class ParsedQuery(Query): class RawQuery(Query): def __init__(self, queryString): + if isinstance(queryString, unicode): + queryString = queryString.encode('utf-8') + assert isinstance(queryString, str) self.queryString = queryString # HG changeset patch # User Franz Pletz # Date 1149929313 -7200 # Node ID cbdf642bc58e1f75fcc46a4772095a7d817e45b5 # Parent 7250d2be26d6ea5c7e608716d2234299cbc47db7 fix LinkSearch, ensure lowercase comparison diff -r 7250d2be26d6 -r cbdf642bc58e MoinMoin/search.py --- a/MoinMoin/search.py Sat Jun 10 10:43:48 2006 +0200 +++ 
b/MoinMoin/search.py Sat Jun 10 10:48:33 2006 +0200 @@ -436,12 +436,12 @@ class LinkSearch(BaseExpression): Found = True for link in page.getPageLinks(page.request): - if ((self.static and self.pattern == link) or + if ((self.static and self.pattern.lower() == link.lower()) or (not self.static and self.search_re.match(link))): break else: Found = False - + if Found: # Search in page text results = self.textsearch.search(page) # HG changeset patch # User Franz Pletz # Date 1149929598 -7200 # Node ID c0b243a72744e65ae27a0dd983d2f98720c21050 # Parent cbdf642bc58e1f75fcc46a4772095a7d817e45b5 s/\'utf-8\'/config.charset/g diff -r cbdf642bc58e -r c0b243a72744 MoinMoin/search.py --- a/MoinMoin/search.py Sat Jun 10 10:48:33 2006 +0200 +++ b/MoinMoin/search.py Sat Jun 10 10:53:18 2006 +0200 @@ -295,7 +295,7 @@ class TextSearch(BaseExpression): # all parsed wikiwords, AND'ed queries = [] for t in terms: - t = [i.encode('utf-8') for i in list(analyzer.tokenize(t))] + t = [i.encode(config.charset) for i in list(analyzer.tokenize(t))] if len(t) < 2: queries.append(xapian.Query(t[0])) else: @@ -374,8 +374,8 @@ class TitleSearch(BaseExpression): # all parsed wikiwords, AND'ed queries = [] for t in terms: - t = [i.encode('utf-8') for i in list(analyzer.tokenize(t))] - t = ['%s%s' % (Xapian.Index.prefixMap['title'], i) for i in t] + t = ['%s%s' % (Xapian.Index.prefixMap['title'], + i.encode(config.charset)) for i in list(analyzer.tokenize(t))] if len(t) < 2: queries.append(xapian.Query(t[0])) else: @@ -468,7 +468,9 @@ class LinkSearch(BaseExpression): if self.use_re: return None # xapian doesnt support regex search else: - term = xapian.Query(('%s%s' % (Xapian.Index.prefixMap['linkto'], pattern.lower())).encode('utf-8')) + term = xapian.Query(('%s%s' % + (Xapian.Index.prefixMap['linkto'], + pattern.lower())).encode(config.charset)) return term ############################################################################ # HG changeset patch # User Franz Pletz # Date 1149931707 -7200 
# Node ID ea1fc283b74205208080b0c0489c9207146a2fee # Parent c0b243a72744e65ae27a0dd983d2f98720c21050 fixup xapwrap for capitalized prefixed words diff -r c0b243a72744 -r ea1fc283b742 MoinMoin/support/xapwrap/document.py --- a/MoinMoin/support/xapwrap/document.py Sat Jun 10 10:53:18 2006 +0200 +++ b/MoinMoin/support/xapwrap/document.py Sat Jun 10 11:28:27 2006 +0200 @@ -1,6 +1,7 @@ """ xapwrap.document - Pythonic wrapper around Xapian's Document API """ +import string import datetime import re import cPickle @@ -296,7 +297,7 @@ def makePairForWrite(prefix, token, pref else: # we have a map, so first translate it using the map (e.g. 'title' -> 'S') prefix = prefixMap.get(prefix, prefix.upper()) - result = prefix + token + result = '%s%s%s' % (prefix, token[0] in string.uppercase and ':' or '', token) # since return value is going into the db, it must be encoded as UTF-8 result = result.encode(UNICODE_ENCODING, UNICODE_ERROR_POLICY) return checkKeyLen(result) # HG changeset patch # User Franz Pletz # Date 1149931716 -7200 # Node ID 47a674c709668f402920869715cfd3234dce1223 # Parent ea1fc283b74205208080b0c0489c9207146a2fee case-insensitive LinkSearch diff -r ea1fc283b742 -r 47a674c70966 MoinMoin/Xapian.py --- a/MoinMoin/Xapian.py Sat Jun 10 11:28:27 2006 +0200 +++ b/MoinMoin/Xapian.py Sat Jun 10 11:28:36 2006 +0200 @@ -491,7 +491,9 @@ class Index: updated = False if mode == 'update': - # from #xapian: if you generate a special "unique id" term, you can just call database.replace_document(uid_term, doc) + # from #xapian: if you generate a special "unique id" term, + # you can just call database.replace_document(uid_term, doc) + # -> done in xapwrap.index.Index.index() query = xapidx.RawQuery(xapdoc.makePairForWrite('itemid', itemid)) docs = writer.search(query, valuesWanted=['pagename', 'attachment', 'mtime', 'wikiname', ]) if docs: @@ -514,7 +516,7 @@ class Index: xtitle = xapdoc.TextField('title', pagename, True) # prefixed xkeywords = [xapdoc.Keyword('itemid', 
itemid)] for pagelink in page.getPageLinks(request): - xkeywords.append(xapdoc.Keyword('linkto', pagelink.lower())) + xkeywords.append(xapdoc.Keyword('linkto', pagelink)) xcontent = xapdoc.TextField('content', page.get_raw_body()) doc = xapdoc.Document(textFields=(xcontent, xtitle), keywords=xkeywords, diff -r ea1fc283b742 -r 47a674c70966 MoinMoin/search.py --- a/MoinMoin/search.py Sat Jun 10 11:28:27 2006 +0200 +++ b/MoinMoin/search.py Sat Jun 10 11:28:36 2006 +0200 @@ -10,7 +10,7 @@ @license: GNU GPL, see COPYING for details """ -import re, time, sys, StringIO, operator +import re, time, sys, StringIO, string from MoinMoin import wikiutil, config from MoinMoin.Page import Page @@ -436,7 +436,7 @@ class LinkSearch(BaseExpression): Found = True for link in page.getPageLinks(page.request): - if ((self.static and self.pattern.lower() == link.lower()) or + if ((self.static and self.pattern == link) or (not self.static and self.search_re.match(link))): break else: @@ -468,9 +468,10 @@ class LinkSearch(BaseExpression): if self.use_re: return None # xapian doesnt support regex search else: - term = xapian.Query(('%s%s' % + term = xapian.Query(('%s%s%s' % (Xapian.Index.prefixMap['linkto'], - pattern.lower())).encode(config.charset)) + pattern[0] in string.uppercase and ':' or '', + pattern)).encode(config.charset)) return term ############################################################################ # HG changeset patch # User Franz Pletz # Date 1149944768 -7200 # Node ID 17d66aec432c309ab12d56a4a9768c5c5ce083f5 # Parent 47a674c709668f402920869715cfd3234dce1223 add Xapian.UnicodeQuery, small cleanups diff -r 47a674c70966 -r 17d66aec432c MoinMoin/Xapian.py --- a/MoinMoin/Xapian.py Sat Jun 10 11:28:36 2006 +0200 +++ b/MoinMoin/Xapian.py Sat Jun 10 15:06:08 2006 +0200 @@ -2,7 +2,8 @@ """ MoinMoin - xapian indexing search engine - @copyright: 2006 by Thomas Waldmann + @copyright: 2006 MoinMoin:ThomasWaldmann, + 2006 MoinMoin:FranzPletz @license: GNU GPL, see COPYING for 
details. """ debug = True @@ -10,6 +11,7 @@ import sys, os, re, codecs, errno, time import sys, os, re, codecs, errno, time from pprint import pprint +import xapian from MoinMoin.support.xapwrap import document as xapdoc from MoinMoin.support.xapwrap import index as xapidx from MoinMoin.parser.text_moin_wiki import Parser as WikiParser @@ -17,6 +19,19 @@ from MoinMoin.Page import Page from MoinMoin.Page import Page from MoinMoin import config, wikiutil from MoinMoin.util import filesys, lock + + +class UnicodeQuery(xapian.Query): + def __init__(self, *args, **kwargs): + self.encoding = kwargs.get('encoding', config.charset) + + nargs = [] + for i in args: + if isinstance(i, unicode): + i = i.encode(self.encoding) + nargs.append(i) + + xapian.Query.__init__(self, *nargs, **kwargs) ############################################################################## diff -r 47a674c70966 -r 17d66aec432c MoinMoin/search.py --- a/MoinMoin/search.py Sat Jun 10 11:28:36 2006 +0200 +++ b/MoinMoin/search.py Sat Jun 10 15:06:08 2006 +0200 @@ -15,7 +15,8 @@ from MoinMoin.Page import Page from MoinMoin.Page import Page import Xapian -import xapian +from xapian import Query +from Xapian import UnicodeQuery ############################################################################# ### query objects @@ -186,9 +187,9 @@ class AndExpression(BaseExpression): # prepare query for not negated terms if len(terms) == 1: - t1 = xapian.Query(terms[0]) - else: - t1 = xapian.Query(xapian.Query.OP_AND, terms) + t1 = Query(terms[0]) + else: + t1 = Query(Query.OP_AND, terms) # negated terms? 
if not not_terms: @@ -197,11 +198,11 @@ class AndExpression(BaseExpression): # yes, link not negated and negated terms' query with a AND_NOT query if len(not_terms) == 1: - t2 = xapian.Query(not_terms[0]) - else: - t2 = xapian.Query(xapian.Query.OP_OR, not_terms) - - return xapian.Query(xapian.Query.OP_AND_NOT, t1, t2) + t2 = Query(not_terms[0]) + else: + t2 = Query(Query.OP_OR, not_terms) + + return Query(Query.OP_AND_NOT, t1, t2) class OrExpression(AndExpression): @@ -225,7 +226,7 @@ class OrExpression(AndExpression): def xapian_term(self): # XXX: negated terms managed by _moinSearch? - return xapian.Query(xapian.Query.OP_OR, [term.xapian_term() for term in self._subterms]) + return Query(Query.OP_OR, [term.xapian_term() for term in self._subterms]) class TextSearch(BaseExpression): @@ -297,16 +298,14 @@ class TextSearch(BaseExpression): for t in terms: t = [i.encode(config.charset) for i in list(analyzer.tokenize(t))] if len(t) < 2: - queries.append(xapian.Query(t[0])) + queries.append(UnicodeQuery(t[0])) else: - queries.append(xapian.Query(xapian.Query.OP_AND, t)) + queries.append(UnicodeQuery(Query.OP_AND, t)) # titlesearch OR parsed wikiwords - term = xapian.Query(xapian.Query.OP_OR, + return Query(Query.OP_OR, (self.titlesearch.xapian_term(), - xapian.Query(xapian.Query.OP_AND, queries))) - - return term + Query(Query.OP_AND, queries))) class TitleSearch(BaseExpression): @@ -374,16 +373,14 @@ class TitleSearch(BaseExpression): # all parsed wikiwords, AND'ed queries = [] for t in terms: - t = ['%s%s' % (Xapian.Index.prefixMap['title'], - i.encode(config.charset)) for i in list(analyzer.tokenize(t))] + t = ['%s%s' % (Xapian.Index.prefixMap['title'], i) + for i in list(analyzer.tokenize(t))] if len(t) < 2: - queries.append(xapian.Query(t[0])) + queries.append(UnicodeQuery(t[0])) else: - queries.append(xapian.Query(xapian.Query.OP_AND, t)) - - term = xapian.Query(xapian.Query.OP_AND, queries) - - return term + queries.append(UnicodeQuery(Query.OP_AND, t)) + + 
return Query(Query.OP_AND, queries) class LinkSearch(BaseExpression): @@ -468,11 +465,8 @@ class LinkSearch(BaseExpression): if self.use_re: return None # xapian doesnt support regex search else: - term = xapian.Query(('%s%s%s' % - (Xapian.Index.prefixMap['linkto'], - pattern[0] in string.uppercase and ':' or '', - pattern)).encode(config.charset)) - return term + return UnicodeQuery('%s:%s' % + (Xapian.Index.prefixMap['linkto'], pattern)) ############################################################################ ### Results diff -r 47a674c70966 -r 17d66aec432c MoinMoin/support/xapwrap/document.py --- a/MoinMoin/support/xapwrap/document.py Sat Jun 10 11:28:36 2006 +0200 +++ b/MoinMoin/support/xapwrap/document.py Sat Jun 10 15:06:08 2006 +0200 @@ -297,7 +297,7 @@ def makePairForWrite(prefix, token, pref else: # we have a map, so first translate it using the map (e.g. 'title' -> 'S') prefix = prefixMap.get(prefix, prefix.upper()) - result = '%s%s%s' % (prefix, token[0] in string.uppercase and ':' or '', token) + result = '%s%s%s' % (prefix, prefix[0] == 'X' and ':' or '', token) # since return value is going into the db, it must be encoded as UTF-8 result = result.encode(UNICODE_ENCODING, UNICODE_ERROR_POLICY) return checkKeyLen(result) diff -r 47a674c70966 -r 17d66aec432c MoinMoin/support/xapwrap/index.py --- a/MoinMoin/support/xapwrap/index.py Sat Jun 10 11:28:36 2006 +0200 +++ b/MoinMoin/support/xapwrap/index.py Sat Jun 10 15:06:08 2006 +0200 @@ -489,16 +489,16 @@ class ReadOnlyIndex: if self.db is None: self._setupDB() - self.qp = xapian.QueryParser() + #self.qp = xapian.QueryParser() # this is vital: these options specify no language for # stemming (""), disable stemming (False), and specify an # empty stop word object (None). 
we need this because by # default, xapian's query parser does english stemming - s = xapian.Stem(self.STEMMING_LANGUAGE) - self.qp.set_stemmer(s) + #s = xapian.Stem(self.STEMMING_LANGUAGE) + #self.qp.set_stemmer(s) # we want query terms to be ANDed together by default - self.qp.set_default_op(self.DEFAULT_QUERY_COMBINER_OP) + #self.qp.set_default_op(self.DEFAULT_QUERY_COMBINER_OP) self._configure() log("Index %s contains %s documents" % # HG changeset patch # User Franz Pletz # Date 1149946171 -7200 # Node ID 4562cd3a4a5fdde0e548f31a6fde8a86f1cf8af5 # Parent 17d66aec432c309ab12d56a4a9768c5c5ce083f5 fix contentfilter diff -r 17d66aec432c -r 4562cd3a4a5f MoinMoin/Xapian.py --- a/MoinMoin/Xapian.py Sat Jun 10 15:06:08 2006 +0200 +++ b/MoinMoin/Xapian.py Sat Jun 10 15:29:31 2006 +0200 @@ -419,6 +419,7 @@ class Index: for modulename in mt.module_name(): try: execute = wikiutil.importPlugin(request.cfg, 'filter', modulename) + break except wikiutil.PluginMissingError: pass #else: # HG changeset patch # User Franz Pletz # Date 1150372346 -7200 # Node ID 11a9d77e92d327b5284479d7d52a932120013a29 # Parent 4bd5f5f8f95a96e3aa5b5e6176482d93473d85ab stemming works.. 
in english diff -r 4bd5f5f8f95a -r 11a9d77e92d3 MoinMoin/Xapian.py --- a/MoinMoin/Xapian.py Wed Jun 14 20:33:15 2006 +0200 +++ b/MoinMoin/Xapian.py Thu Jun 15 13:52:26 2006 +0200 @@ -20,16 +20,23 @@ from MoinMoin import config, wikiutil from MoinMoin import config, wikiutil from MoinMoin.util import filesys, lock +try: + from Stemmer import Stemmer + def getStemmer(algorithm='english'): + return Stemmer(algorithm) + use_stemming = True +except ImportError: + use_stemming = False class UnicodeQuery(xapian.Query): def __init__(self, *args, **kwargs): self.encoding = kwargs.get('encoding', config.charset) nargs = [] - for i in args: - if isinstance(i, unicode): - i = i.encode(self.encoding) - nargs.append(i) + for term in args: + if isinstance(term, unicode): + term = term.encode(self.encoding) + nargs.append(term) xapian.Query.__init__(self, *nargs, **kwargs) @@ -62,6 +69,10 @@ class WikiAnalyzer: # XXX limit stuff above to xapdoc.MAX_KEY_LEN # WORD_RE = re.compile('\\w{1,%i}' % MAX_KEY_LEN, re.U) + def __init__(self): + if use_stemming: + self.stemmer = getStemmer() + def tokenize(self, value): """Yield a stream of lower cased words from a string. 
value must be an UNICODE object or a list of unicode objects @@ -69,6 +80,8 @@ class WikiAnalyzer: def enc(uc): """ 'encode' unicode results into whatever xapian / xapwrap wants """ lower = uc.lower() + if use_stemming: + return self.stemmer.stemWord(lower) return lower if isinstance(value, list): # used for page links @@ -93,7 +106,7 @@ class WikiAnalyzer: yield enc(word) elif m.group("word"): word = m.group("word") - yield enc(word) + yield enc(word) # if it is a CamelCaseWord, we additionally yield Camel, Case and Word if self.wikiword_re.match(word): for sm in re.finditer(self.singleword_re, word): @@ -539,7 +552,7 @@ class Index: sortFields=(xpname, xattachment, xmtime, xwname, ), ) doc.analyzerFactory = WikiAnalyzer - #search_db_language = "english" + #search_db_language = "english" # XXX: hardcoded #stemmer = xapian.Stem(search_db_language) #pagetext = page.get_raw_body().lower() #words = re.finditer(r"\w+", pagetext) diff -r 4bd5f5f8f95a -r 11a9d77e92d3 MoinMoin/search.py --- a/MoinMoin/search.py Wed Jun 14 20:33:15 2006 +0200 +++ b/MoinMoin/search.py Thu Jun 15 13:52:26 2006 +0200 @@ -89,7 +89,8 @@ class BaseExpression: self.pattern = pattern else: pattern = re.escape(pattern) - self.search_re = re.compile(pattern, flags) + self.search_re = re.compile(r'%s[%s]*' % (pattern, + config.chars_lower), flags) self.pattern = pattern @@ -247,8 +248,18 @@ class TextSearch(BaseExpression): self.negated = 0 self.use_re = use_re self.case = case + + if self.xapian_wanted() and Xapian.use_stemming: + terms = self._pattern.split(' ') + terms = Xapian.getStemmer().stemWords(terms) + self._pattern = ' '.join(terms) + stemmed = True + else: + stemmed = False + self._build_re(self._pattern, use_re=use_re, case=case) - self.titlesearch = TitleSearch(self._pattern, use_re=use_re, case=case) + self.titlesearch = TitleSearch(self._pattern, use_re=use_re, + case=case, stemmed=stemmed) def costs(self): return 10000 @@ -292,7 +303,7 @@ class TextSearch(BaseExpression): else: 
analyzer = Xapian.WikiAnalyzer() terms = self._pattern.split() - + # all parsed wikiwords, AND'ed queries = [] for t in terms: @@ -311,7 +322,7 @@ class TitleSearch(BaseExpression): class TitleSearch(BaseExpression): """ Term searches in pattern in page title only """ - def __init__(self, pattern, use_re=False, case=False): + def __init__(self, pattern, use_re=False, case=False, stemmed=False): """ Init a title search @param pattern: pattern to search for, ascii string or unicode @@ -322,7 +333,13 @@ class TitleSearch(BaseExpression): self.negated = 0 self.use_re = use_re self.case = case - self._build_re(unicode(pattern), use_re=use_re, case=case) + + if not stemmed and self.xapian_wanted() and Xapian.use_stemming: + terms = self._pattern.split(' ') + terms = Xapian.getStemmer().stemWords(terms) + self._pattern = ' '.join(terms) + + self._build_re(self._pattern, use_re=use_re, case=case) def costs(self): return 100 diff -r 4bd5f5f8f95a -r 11a9d77e92d3 docs/CHANGES.fpletz --- a/docs/CHANGES.fpletz Wed Jun 14 20:33:15 2006 +0200 +++ b/docs/CHANGES.fpletz Thu Jun 15 13:52:26 2006 +0200 @@ -2,11 +2,13 @@ Branch moin/1.6-xapian-fpletz ============================= Known main issues: - * ... + * Stemming in English only for now because we would have to stem every + word in a query for every language. Suggestions? ;-) + * Somethings' wrong with the matching of stemmed terms, i.e. matches + beyond single WikiWord borders although matching lower-case only + (see MoinMoin/search.py:92) ToDo: - * Manually parse prefixes (e.g. title:) in MoinMoin.Xapian.Index - right before searching * Mockup the new search UI New Features: @@ -31,5 +33,8 @@ tweaking to use AND_NOT because Xapian d tweaking to use AND_NOT because Xapian doesn't provide a pure NOT. Should be no issue with OrExpression as _moinSearch handles this correctly. 
-2006-06-11 +2006-06-11 Now handling prefixes correctly (title -> S, XLINKTO always +with ':') +2006-06-15 Integrated stemming, english only for now (see issues). + # HG changeset patch # User Franz Pletz # Date 1150372502 -7200 # Node ID 3990412057733bf3ad1f4c7a23adbb3d81310617 # Parent 11a9d77e92d327b5284479d7d52a932120013a29 added comment about stemmer used diff -r 11a9d77e92d3 -r 399041205773 MoinMoin/Xapian.py --- a/MoinMoin/Xapian.py Thu Jun 15 13:52:26 2006 +0200 +++ b/MoinMoin/Xapian.py Thu Jun 15 13:55:02 2006 +0200 @@ -21,6 +21,7 @@ from MoinMoin.util import filesys, lock from MoinMoin.util import filesys, lock try: + # PyStemmer, snowball python bindings from http://snowball.tartarus.org/ from Stemmer import Stemmer def getStemmer(algorithm='english'): return Stemmer(algorithm) # HG changeset patch # User Franz Pletz # Date 1150377794 -7200 # Node ID 04703997eb668ad6c413c5586728414b5820d1b2 # Parent fcdce5331c6e1154f5c7a972eead919fa03dbc00 added language indexing diff -r fcdce5331c6e -r 04703997eb66 MoinMoin/Xapian.py --- a/MoinMoin/Xapian.py Thu Jun 15 13:56:27 2006 +0200 +++ b/MoinMoin/Xapian.py Thu Jun 15 15:23:14 2006 +0200 @@ -372,7 +372,7 @@ class Index: indexThread.join() return func - self.request.finish = joinDecorator(self.request.finish) + self.request.finish = joinDecorator(self.request.finish) indexThread.start() except: self.lock.release() @@ -405,7 +405,7 @@ class Index: indexThread.join() return func - self.request.finish = joinDecorator(self.request.finish) + self.request.finish = joinDecorator(self.request.finish) indexThread.start() except: self.lock.release() @@ -436,8 +436,8 @@ class Index: break except wikiutil.PluginMissingError: pass - #else: - # raise "Cannot load filter for mimetype." + modulename # XXX + else: + request.log("Cannot load filter for mimetype." 
+ modulename) try: data = execute(self, filename) if debug: @@ -505,6 +505,23 @@ class Index: except (OSError, IOError), err: pass + def _get_language(self, page): + body = page.get_raw_body() + + for line in body.split('\n'): + if line.startswith('#language'): + lang = line.split(' ')[1] + try: + getStemmer(lang) + except KeyError: + break + else: + return lang + elif not line.startswith('#'): + break + + return page.request.cfg.language_default + def _index_page(self, writer, page, mode='update'): """ Index a page - assumes that the write lock is acquired @arg writer: the index writer object @@ -518,6 +535,7 @@ class Index: pagename = page.page_name mtime = page.mtime_usecs() itemid = "%s:%s" % (wikiname, pagename) + language = self._get_language(page) # XXX: Hack until we get proper metadata updated = False if mode == 'update': @@ -544,7 +562,8 @@ class Index: xattachment = xapdoc.SortKey('attachment', '') # this is a real page, not an attachment xmtime = xapdoc.SortKey('mtime', mtime) xtitle = xapdoc.TextField('title', pagename, True) # prefixed - xkeywords = [xapdoc.Keyword('itemid', itemid)] + xkeywords = [xapdoc.Keyword('itemid', itemid), + xapdoc.Keyword('lang', language)] for pagelink in page.getPageLinks(request): xkeywords.append(xapdoc.Keyword('linkto', pagelink)) xcontent = xapdoc.TextField('content', page.get_raw_body()) @@ -645,7 +664,7 @@ class Index: fname = fname.strip() self._index_file(request, writer, fname, mode) writer.close() - request.log("indexing completed successfully in %0.2f seconds." % + request.log("indexing completed successfully in %0.2f seconds." 
% (time.time() - start)) self._sign() finally: # HG changeset patch # User Franz Pletz # Date 1150380688 -7200 # Node ID 813125ff0d74240edd9e8170c87daeae141ab100 # Parent 04703997eb668ad6c413c5586728414b5820d1b2 Introducing LanguageSearch diff -r 04703997eb66 -r 813125ff0d74 MoinMoin/Xapian.py --- a/MoinMoin/Xapian.py Thu Jun 15 15:23:14 2006 +0200 +++ b/MoinMoin/Xapian.py Thu Jun 15 16:11:28 2006 +0200 @@ -264,6 +264,7 @@ class Index: # the D term, and changing the last digit to a '2' if it's a '3') #X longer prefix for user-defined use 'linkto': 'XLINKTO', # this document links to that document + 'stem_lang': 'XSTEMLANG', # ISO Language code this document was stemmed in #Y year (four digits) } @@ -505,22 +506,31 @@ class Index: except (OSError, IOError), err: pass - def _get_language(self, page): + def _get_languages(self, page): body = page.get_raw_body() - + default_lang = page.request.cfg.language_default + + lang = '' for line in body.split('\n'): if line.startswith('#language'): lang = line.split(' ')[1] try: getStemmer(lang) except KeyError: + # lang is not stemmable break else: - return lang + # lang is stemmable + return (lang, lang) elif not line.startswith('#'): break - - return page.request.cfg.language_default + + if not lang: + # no lang found at all.. 
fallback to default language + lang = default_lang + + # return actual lang and lang to stem in + return (lang, default_lang) def _index_page(self, writer, page, mode='update'): """ Index a page - assumes that the write lock is acquired @@ -535,7 +545,8 @@ class Index: pagename = page.page_name mtime = page.mtime_usecs() itemid = "%s:%s" % (wikiname, pagename) - language = self._get_language(page) # XXX: Hack until we get proper metadata + # XXX: Hack until we get proper metadata + language, stem_language = self._get_languages(page) updated = False if mode == 'update': @@ -563,7 +574,8 @@ class Index: xmtime = xapdoc.SortKey('mtime', mtime) xtitle = xapdoc.TextField('title', pagename, True) # prefixed xkeywords = [xapdoc.Keyword('itemid', itemid), - xapdoc.Keyword('lang', language)] + xapdoc.Keyword('lang', language), + xapdoc.Keyword('stem_lang', stem_language)] for pagelink in page.getPageLinks(request): xkeywords.append(xapdoc.Keyword('linkto', pagelink)) xcontent = xapdoc.TextField('content', page.get_raw_body()) diff -r 04703997eb66 -r 813125ff0d74 MoinMoin/search.py --- a/MoinMoin/search.py Thu Jun 15 15:23:14 2006 +0200 +++ b/MoinMoin/search.py Thu Jun 15 16:11:28 2006 +0200 @@ -404,7 +404,7 @@ class LinkSearch(BaseExpression): """ Search the term in the pagelinks """ def __init__(self, pattern, use_re=False, case=True): - """ Init a title search + """ Init a link search @param pattern: pattern to search for, ascii string or unicode @param use_re: treat pattern as re of plain text, bool @@ -483,6 +483,56 @@ class LinkSearch(BaseExpression): else: return UnicodeQuery('%s:%s' % (Xapian.Index.prefixMap['linkto'], pattern)) + + +class LanguageSearch(BaseExpression): + """ Search the pages written in a language """ + + def __init__(self, pattern, use_re=False, case=True): + """ Init a language search + + @param pattern: pattern to search for, ascii string or unicode + @param use_re: treat pattern as re of plain text, bool + @param case: do case sensitive search, 
bool + """ + # used for search in languages, always lowercase + self._pattern = pattern.lower() + self.negated = 0 + self.use_re = use_re + self.case = case + self.xapian_called = False + self._build_re(self._pattern, use_re=use_re, case=case) + + def costs(self): + return 5000 # cheaper than a TextSearch + + def __unicode__(self): + neg = self.negated and '-' or '' + return u'%s!"%s"' % (neg, unicode(self._pattern)) + + def highlight_re(self): + return "" + + def search(self, page): + # We just use (and trust ;)) xapian for this.. deactivated for _moinSearch + if not self.xapian_called: + return None + else: + # XXX why not return None or empty list? + return [Match()] + + def xapian_wanted(self): + return not self.use_re + + def xapian_term(self): + pattern = self.pattern + if self.use_re: + return None # xapian doesnt support regex search + else: + self.xapian_called = True + return UnicodeQuery('%s%s' % + (Xapian.Index.prefixMap['lang'], pattern)) + ############################################################################ ### Results @@ -782,7 +832,8 @@ class QueryParser: title_search = self.titlesearch regex = self.regex case = self.case - linkto = 0 + linkto = False + lang = False for m in modifiers: if "title".startswith(m): @@ -793,8 +844,12 @@ class QueryParser: case = True elif "linkto".startswith(m): linkto = True - - if linkto: + elif "lang".startswith(m): + lang = True + + if lang: + obj = LanguageSearch(text, use_re=regex, case=False) + elif linkto: obj = LinkSearch(text, use_re=regex, case=case) elif title_search: obj = TitleSearch(text, use_re=regex, case=case) diff -r 04703997eb66 -r 813125ff0d74 docs/CHANGES.fpletz --- a/docs/CHANGES.fpletz Thu Jun 15 15:23:14 2006 +0200 +++ b/docs/CHANGES.fpletz Thu Jun 15 16:11:28 2006 +0200 @@ -7,12 +7,16 @@ Branch moin/1.6-xapian-fpletz * Somethings' wrong with the matching of stemmed terms, i.e. 
matches beyond single WikiWord borders although matching lower-case only (see MoinMoin/search.py:92) + * Regex searching with Xapian? ToDo: + * HACK: MoinMoin.Xapian.Index._get_languages (wait for proper metadata) * Mockup the new search UI New Features: - * TBD + * Faster search thanks to Xapian + * Searching for languages with new prefix 'lang', i.e. lang:de + Note: Only available when Xapian is activated Bugfixes (only stuff that is buggy in moin/1.6 main branch): * ... @@ -36,5 +40,6 @@ 2006-06-11 Now handling prefixes correct 2006-06-11 Now handling prefixes correctly (title -> S, XLINKTO always with ':') -2006-06-15 Integrated stemming, english only for now (see issues). - +2006-06-15 + * Integrated basic stemming, english only for now (see issues). + * Introduced LanguageSearch (lang:) # HG changeset patch # User Franz Pletz # Date 1150382656 -7200 # Node ID ac386d2622aff8fc89e0f6a745e4c170c338df4f # Parent 813125ff0d74240edd9e8170c87daeae141ab100 small cleanup, add language for attachments diff -r 813125ff0d74 -r ac386d2622af MoinMoin/Xapian.py --- a/MoinMoin/Xapian.py Thu Jun 15 16:11:28 2006 +0200 +++ b/MoinMoin/Xapian.py Thu Jun 15 16:44:16 2006 +0200 @@ -631,11 +631,12 @@ class Index: xattachment = xapdoc.SortKey('attachment', att) # this is an attachment, store its filename xmtime = xapdoc.SortKey('mtime', mtime) xtitle = xapdoc.Keyword('title', '%s/%s' % (pagename, att)) + xlanguage = xapdoc.Keyword('lang', language) mimetype, att_content = self.contentfilter(filename) xmimetype = xapdoc.TextField('mimetype', mimetype, True) xcontent = xapdoc.TextField('content', att_content) doc = xapdoc.Document(textFields=(xcontent, xmimetype, ), - keywords=(xatt_itemid, xtitle, ), + keywords=(xatt_itemid, xtitle, xlanguage), sortFields=(xpname, xattachment, xmtime, xwname, ), ) doc.analyzerFactory = WikiAnalyzer diff -r 813125ff0d74 -r ac386d2622af MoinMoin/search.py --- a/MoinMoin/search.py Thu Jun 15 16:11:28 2006 +0200 +++ b/MoinMoin/search.py Thu Jun 15 
16:44:16 2006 +0200 @@ -495,7 +495,7 @@ class LanguageSearch(BaseExpression): @param use_re: treat pattern as re of plain text, bool @param case: do case sensitive search, bool """ - # used for search in languages, always lowercase + # iso language code, always lowercase self._pattern = pattern.lower() self.negated = 0 self.use_re = use_re diff -r 813125ff0d74 -r ac386d2622af docs/CHANGES.fpletz --- a/docs/CHANGES.fpletz Thu Jun 15 16:11:28 2006 +0200 +++ b/docs/CHANGES.fpletz Thu Jun 15 16:44:16 2006 +0200 @@ -12,6 +12,7 @@ Branch moin/1.6-xapian-fpletz ToDo: * HACK: MoinMoin.Xapian.Index._get_languages (wait for proper metadata) * Mockup the new search UI + * Write/update documentation for all the new search stuff New Features: * Faster search thanks to Xapian # HG changeset patch # User Franz Pletz # Date 1150396169 -7200 # Node ID 02d6697b000d3f9158ca2b805560ba41edc74ae4 # Parent ac386d2622aff8fc89e0f6a745e4c170c338df4f basic searching using stemmed and unstemmed terms diff -r ac386d2622af -r 02d6697b000d MoinMoin/Xapian.py --- a/MoinMoin/Xapian.py Thu Jun 15 16:44:16 2006 +0200 +++ b/MoinMoin/Xapian.py Thu Jun 15 20:29:29 2006 +0200 @@ -23,8 +23,6 @@ try: try: # PyStemmer, snowball python bindings from http://snowball.tartarus.org/ from Stemmer import Stemmer - def getStemmer(algorithm='english'): - return Stemmer(algorithm) use_stemming = True except ImportError: use_stemming = False @@ -37,6 +35,8 @@ class UnicodeQuery(xapian.Query): for term in args: if isinstance(term, unicode): term = term.encode(self.encoding) + elif isinstance(term, list) or isinstance(term, tuple): + term = map(lambda t: t.encode(self.encoding), term) nargs.append(term) xapian.Query.__init__(self, *nargs, **kwargs) @@ -45,6 +45,9 @@ class UnicodeQuery(xapian.Query): ############################################################################## ### Tokenizer ############################################################################## + +def getWikiAnalyzerFactory(language='en'): + 
return (lambda: WikiAnalyzer(language)) class WikiAnalyzer: singleword = r"[%(u)s][%(l)s]+" % { @@ -70,19 +73,16 @@ class WikiAnalyzer: # XXX limit stuff above to xapdoc.MAX_KEY_LEN # WORD_RE = re.compile('\\w{1,%i}' % MAX_KEY_LEN, re.U) - def __init__(self): - if use_stemming: - self.stemmer = getStemmer() - - def tokenize(self, value): - """Yield a stream of lower cased words from a string. - value must be an UNICODE object or a list of unicode objects - """ + def __init__(self, language=None): + if use_stemming and language: + self.stemmer = Stemmer(language) + else: + self.stemmer = None + + def raw_tokenize(self, value): def enc(uc): """ 'encode' unicode results into whatever xapian / xapwrap wants """ lower = uc.lower() - if use_stemming: - return self.stemmer.stemWord(lower) return lower if isinstance(value, list): # used for page links @@ -113,6 +113,18 @@ class WikiAnalyzer: for sm in re.finditer(self.singleword_re, word): yield enc(sm.group()) + def tokenize(self, value, flat_stemming=True): + """Yield a stream of lower cased raw and stemmed (optional) words from a string. + value must be an UNICODE object or a list of unicode objects + """ + for i in self.raw_tokenize(value): + if flat_stemming: + yield i # XXX: should we really use a prefix for that? Index.prefixMap['raw'] + i + if self.stemmer: + yield self.stemmer.stemWord(i) + else: + yield (i, self.stemmer.stemWord(i)) + ############################################################################# ### Indexing @@ -254,7 +266,7 @@ class Index: #N ISO couNtry code (or domaiN name) #P Pathname #Q uniQue id - #R Raw (i.e. unstemmed) term + 'raw': 'R', # Raw (i.e. 
unstemmed) term 'title': 'S', # Subject (or title) 'mimetype': 'T', 'url': 'U', # full URL of indexed document - if the resulting term would be > 240 @@ -495,7 +507,7 @@ class Index: keywords=(xtitle, xitemid, ), sortFields=(xpname, xattachment, xmtime, xwname, ), ) - doc.analyzerFactory = WikiAnalyzer + doc.analyzerFactory = getWikiAnalyzerFactory() if mode == 'update': if debug: request.log("%s (replace %r)" % (filename, uid)) doc.uid = uid @@ -515,7 +527,7 @@ class Index: if line.startswith('#language'): lang = line.split(' ')[1] try: - getStemmer(lang) + Stemmer(lang) except KeyError: # lang is not stemmable break @@ -583,17 +595,8 @@ class Index: keywords=xkeywords, sortFields=(xpname, xattachment, xmtime, xwname, ), ) - doc.analyzerFactory = WikiAnalyzer - #search_db_language = "english" # XXX: hardcoded - #stemmer = xapian.Stem(search_db_language) - #pagetext = page.get_raw_body().lower() - #words = re.finditer(r"\w+", pagetext) - #count = 0 - #for wordmatch in words: - # count += 1 - # word = wordmatch.group().encode(config.charset) - # document.add_posting('R' + stemmer.stem_word(word), count) # count should be term position in document (starting at 1) - + doc.analyzerFactory = getWikiAnalyzerFactory() + if mode == 'update': if debug: request.log("%s (replace %r)" % (pagename, uid)) doc.uid = uid @@ -636,10 +639,10 @@ class Index: xmimetype = xapdoc.TextField('mimetype', mimetype, True) xcontent = xapdoc.TextField('content', att_content) doc = xapdoc.Document(textFields=(xcontent, xmimetype, ), - keywords=(xatt_itemid, xtitle, xlanguage), + keywords=(xatt_itemid, xtitle, xlanguage, ), sortFields=(xpname, xattachment, xmtime, xwname, ), ) - doc.analyzerFactory = WikiAnalyzer + doc.analyzerFactory = getWikiAnalyzerFactory() if mode == 'update': if debug: request.log("%s (replace %r)" % (pagename, uid)) doc.uid = uid diff -r ac386d2622af -r 02d6697b000d MoinMoin/search.py --- a/MoinMoin/search.py Thu Jun 15 16:44:16 2006 +0200 +++ b/MoinMoin/search.py Thu Jun 
15 20:29:29 2006 +0200 @@ -10,7 +10,8 @@ @license: GNU GPL, see COPYING for details """ -import re, time, sys, StringIO, string +import re, time, sys, StringIO, string, operator +from sets import Set from MoinMoin import wikiutil, config from MoinMoin.Page import Page @@ -176,15 +177,15 @@ class AndExpression(BaseExpression): wanted = wanted and term.xapian_wanted() return wanted - def xapian_term(self): + def xapian_term(self, request): # sort negated terms terms = [] not_terms = [] for term in self._subterms: if not term.negated: - terms.append(term.xapian_term()) + terms.append(term.xapian_term(request)) else: - not_terms.append(term.xapian_term()) + not_terms.append(term.xapian_term(request)) # prepare query for not negated terms if len(terms) == 1: @@ -225,9 +226,9 @@ class OrExpression(AndExpression): matches.extend(result) return matches - def xapian_term(self): + def xapian_term(self, request): # XXX: negated terms managed by _moinSearch? - return Query(Query.OP_OR, [term.xapian_term() for term in self._subterms]) + return Query(Query.OP_OR, [term.xapian_term(request) for term in self._subterms]) class TextSearch(BaseExpression): @@ -248,18 +249,8 @@ class TextSearch(BaseExpression): self.negated = 0 self.use_re = use_re self.case = case - - if self.xapian_wanted() and Xapian.use_stemming: - terms = self._pattern.split(' ') - terms = Xapian.getStemmer().stemWords(terms) - self._pattern = ' '.join(terms) - stemmed = True - else: - stemmed = False - self._build_re(self._pattern, use_re=use_re, case=case) - self.titlesearch = TitleSearch(self._pattern, use_re=use_re, - case=case, stemmed=stemmed) + self.titlesearch = TitleSearch(self._pattern, use_re=use_re, case=case) def costs(self): return 10000 @@ -297,32 +288,44 @@ class TextSearch(BaseExpression): def xapian_wanted(self): return not self.use_re - def xapian_term(self): + def xapian_term(self, request): if self.use_re: return None # xapian can't do regex search else: - analyzer = Xapian.WikiAnalyzer() + 
analyzer = Xapian.WikiAnalyzer(language=request.cfg.language_default) terms = self._pattern.split() # all parsed wikiwords, AND'ed queries = [] + stemmed = [] for t in terms: - t = [i.encode(config.charset) for i in list(analyzer.tokenize(t))] - if len(t) < 2: - queries.append(UnicodeQuery(t[0])) + if Xapian.use_stemming: + # stemmed OR not stemmed + tmp = [] + for i in analyzer.tokenize(t, flat_stemming=False): + tmp.append(UnicodeQuery(Query.OP_OR, i)) + stemmed.append(i[1]) + t = tmp else: - queries.append(UnicodeQuery(Query.OP_AND, t)) + # just not stemmed + t = [Query(i) for i in analyzer.tokenize(t)] + queries.append(Query(Query.OP_AND, t)) + + # TODO: hilight and sort stemmed words correctly (also in TitleSearch) + #if stemmed: + # self._build_re(' '.join(stemmed), use_re=False, + # case=self.case) # titlesearch OR parsed wikiwords return Query(Query.OP_OR, - (self.titlesearch.xapian_term(), + (self.titlesearch.xapian_term(request), Query(Query.OP_AND, queries))) class TitleSearch(BaseExpression): """ Term searches in pattern in page title only """ - def __init__(self, pattern, use_re=False, case=False, stemmed=False): + def __init__(self, pattern, use_re=False, case=False): """ Init a title search @param pattern: pattern to search for, ascii string or unicode @@ -333,12 +336,6 @@ class TitleSearch(BaseExpression): self.negated = 0 self.use_re = use_re self.case = case - - if not stemmed and self.xapian_wanted() and Xapian.use_stemming: - terms = self._pattern.split(' ') - terms = Xapian.getStemmer().stemWords(terms) - self._pattern = ' '.join(terms) - self._build_re(self._pattern, use_re=use_re, case=case) def costs(self): @@ -379,23 +376,28 @@ class TitleSearch(BaseExpression): def xapian_wanted(self): return not self.use_re - def xapian_term(self): + def xapian_term(self, request): if self.use_re: return None # xapian doesn't support regex search else: - analyzer = Xapian.WikiAnalyzer() + analyzer = 
Xapian.WikiAnalyzer(language=request.cfg.language_default) terms = self._pattern.split() - terms = [list(analyzer.tokenize(t)) for t in terms] + terms = [list(analyzer.raw_tokenize(t)) for t in terms] # all parsed wikiwords, AND'ed queries = [] for t in terms: - t = ['%s%s' % (Xapian.Index.prefixMap['title'], i) - for i in list(analyzer.tokenize(t))] - if len(t) < 2: - queries.append(UnicodeQuery(t[0])) + if Xapian.use_stemming: + # stemmed OR not stemmed + t = [UnicodeQuery(Query.OP_OR, ['%s%s' % + (Xapian.Index.prefixMap['title'], j) for j in i]) + for i in analyzer.tokenize(t, flat_stemming=False)] else: - queries.append(UnicodeQuery(Query.OP_AND, t)) + # just not stemmed + t = [UnicodeQuery('%s%s' % (Xapian.Index.prefixMap['title'], j)) + for i in analyzer.tokenize(t)] + + queries.append(Query(Query.OP_AND, t)) return Query(Query.OP_AND, queries) @@ -476,7 +478,7 @@ class LinkSearch(BaseExpression): def xapian_wanted(self): return not self.use_re - def xapian_term(self): + def xapian_term(self, request): pattern = self.pattern if self.use_re: return None # xapian doesnt support regex search @@ -524,7 +526,7 @@ class LanguageSearch(BaseExpression): def xapian_wanted(self): return not self.use_re - def xapian_term(self): + def xapian_term(self, request): pattern = self.pattern if self.use_re: return None # xapian doesnt support regex search @@ -844,7 +846,7 @@ class QueryParser: case = True elif "linkto".startswith(m): linkto = True - elif "lang".startswith(m): + elif "language".startswith(m): lang = True if lang: @@ -1330,7 +1332,7 @@ class Search: self.request.clock.start('_xapianSearch') try: from MoinMoin.support import xapwrap - query = self.query.xapian_term() + query = self.query.xapian_term(self.request) self.request.log("xapianSearch: query = %r" % query.get_description()) query = xapwrap.index.QObjQuery(query) diff -r ac386d2622af -r 02d6697b000d docs/CHANGES.fpletz --- a/docs/CHANGES.fpletz Thu Jun 15 16:44:16 2006 +0200 +++ b/docs/CHANGES.fpletz Thu 
Jun 15 20:29:29 2006 +0200 @@ -2,21 +2,31 @@ Branch moin/1.6-xapian-fpletz ============================= Known main issues: - * Stemming in English only for now because we would have to stem every - word in a query for every language. Suggestions? ;-) + * Stemming in English only for now because we would have to stem + every word in a query for every language. Suggestions? ;-) * Somethings' wrong with the matching of stemmed terms, i.e. matches beyond single WikiWord borders although matching lower-case only (see MoinMoin/search.py:92) + * Matching of stemmed terms is generally unreliable because the + matches (and consequently the count) are not obtained by Xapian + as _moinSearch is called with the Xapian results. Use the Xapian + matches? * Regex searching with Xapian? ToDo: - * HACK: MoinMoin.Xapian.Index._get_languages (wait for proper metadata) + * HACK: MoinMoin.Xapian.Index._get_languages (wait for proper + metadata) * Mockup the new search UI * Write/update documentation for all the new search stuff + * Wikifarms support (multiple indexes) + * Indexing and searching of Categories (new term prefix) + * Finish the stemming/matching stuff + * Test if indexing/searching works realiably without a stemmer + installed New Features: * Faster search thanks to Xapian - * Searching for languages with new prefix 'lang', i.e. lang:de + * Searching for languages with new prefix lang/language, i.e. lang:de Note: Only available when Xapian is activated Bugfixes (only stuff that is buggy in moin/1.6 main branch): @@ -43,4 +53,9 @@ with ':') 2006-06-15 * Integrated basic stemming, english only for now (see issues). 
- * Introduced LanguageSearch (lang:) + * Introduced LanguageSearch (new prefix lang/language) + * Searching now works with stemmed terms but matching is limited due + to usage of _moinSearch + +2006-06-16 + # HG changeset patch # User Franz Pletz # Date 1150570392 -7200 # Node ID 4d1bc2e5118480e9847ca84bcb8b28c4c28b3389 # Parent 71875396f8126bb73f29e1a61ff3a79ec47e4d06 indexing & searching without stemmer installed diff -r 71875396f812 -r 4d1bc2e51184 MoinMoin/Xapian.py --- a/MoinMoin/Xapian.py Fri Jun 16 14:20:17 2006 +0200 +++ b/MoinMoin/Xapian.py Sat Jun 17 20:53:12 2006 +0200 @@ -523,19 +523,21 @@ class Index: default_lang = page.request.cfg.language_default lang = '' - for line in body.split('\n'): - if line.startswith('#language'): - lang = line.split(' ')[1] - try: - Stemmer(lang) - except KeyError: - # lang is not stemmable + + if use_stemming: + for line in body.split('\n'): + if line.startswith('#language'): + lang = line.split(' ')[1] + try: + Stemmer(lang) + except KeyError: + # lang is not stemmable + break + else: + # lang is stemmable + return (lang, lang) + elif not line.startswith('#'): break - else: - # lang is stemmable - return (lang, lang) - elif not line.startswith('#'): - break if not lang: # no lang found at all.. 
fallback to default language diff -r 71875396f812 -r 4d1bc2e51184 MoinMoin/search.py --- a/MoinMoin/search.py Fri Jun 16 14:20:17 2006 +0200 +++ b/MoinMoin/search.py Sat Jun 17 20:53:12 2006 +0200 @@ -308,7 +308,7 @@ class TextSearch(BaseExpression): t = tmp else: # just not stemmed - t = [Query(i) for i in analyzer.tokenize(t)] + t = [UnicodeQuery(i) for i in analyzer.tokenize(t)] queries.append(Query(Query.OP_AND, t)) # TODO: hilight and sort stemmed words correctly (also in TitleSearch) @@ -394,7 +394,7 @@ class TitleSearch(BaseExpression): for i in analyzer.tokenize(t, flat_stemming=False)] else: # just not stemmed - t = [UnicodeQuery('%s%s' % (Xapian.Index.prefixMap['title'], j)) + t = [UnicodeQuery('%s%s' % (Xapian.Index.prefixMap['title'], i)) for i in analyzer.tokenize(t)] queries.append(Query(Query.OP_AND, t)) diff -r 71875396f812 -r 4d1bc2e51184 docs/CHANGES.fpletz --- a/docs/CHANGES.fpletz Fri Jun 16 14:20:17 2006 +0200 +++ b/docs/CHANGES.fpletz Sat Jun 17 20:53:12 2006 +0200 @@ -2,8 +2,6 @@ Branch moin/1.6-xapian-fpletz ============================= Known main issues: - * Stemming in English only for now because we would have to stem - every word in a query for every language. Suggestions? ;-) * Somethings' wrong with the matching of stemmed terms, i.e. matches beyond single WikiWord borders although matching lower-case only (see MoinMoin/search.py:92) @@ -21,8 +19,6 @@ Branch moin/1.6-xapian-fpletz * Wikifarms support (multiple indexes) * Indexing and searching of Categories (new term prefix) * Finish the stemming/matching stuff - * Test if indexing/searching works realiably without a stemmer - installed New Features: * Faster search thanks to Xapian @@ -48,8 +44,8 @@ tweaking to use AND_NOT because Xapian d tweaking to use AND_NOT because Xapian doesn't provide a pure NOT. Should be no issue with OrExpression as _moinSearch handles this correctly. 
-2006-06-11 Now handling prefixes correctly (title -> S, XLINKTO always -with ':') +2006-06-11 + * Now handling prefixes correctly (title -> S, XLINKTO always with ':') 2006-06-15 * Integrated basic stemming, english only for now (see issues). @@ -58,4 +54,8 @@ 2006-06-15 to usage of _moinSearch 2006-06-16 + * Indexing & searching now works without a stemmer installed (small + bugfixes) +2006-06-17 + # HG changeset patch # User Franz Pletz # Date 1150585610 -7200 # Node ID 0ccd65be56565a2b952356f1a02d4e1106f5347a # Parent 4d1bc2e5118480e9847ca84bcb8b28c4c28b3389 some more code and thinking on matching stemmed words diff -r 4d1bc2e51184 -r 0ccd65be5656 MoinMoin/Xapian.py --- a/MoinMoin/Xapian.py Sat Jun 17 20:53:12 2006 +0200 +++ b/MoinMoin/Xapian.py Sun Jun 18 01:06:50 2006 +0200 @@ -36,7 +36,7 @@ class UnicodeQuery(xapian.Query): if isinstance(term, unicode): term = term.encode(self.encoding) elif isinstance(term, list) or isinstance(term, tuple): - term = map(lambda t: t.encode(self.encoding), term) + term = [t.encode(self.encoding) for t in term] nargs.append(term) xapian.Query.__init__(self, *nargs, **kwargs) diff -r 4d1bc2e51184 -r 0ccd65be5656 MoinMoin/search.py --- a/MoinMoin/search.py Sat Jun 17 20:53:12 2006 +0200 +++ b/MoinMoin/search.py Sun Jun 18 01:06:50 2006 +0200 @@ -76,7 +76,7 @@ class BaseExpression: """ return '' - def _build_re(self, pattern, use_re=False, case=False): + def _build_re(self, pattern, use_re=False, case=False, stemmed=False): """ Make a regular expression out of a text pattern """ flags = case and re.U or (re.I | re.U) if use_re: @@ -90,8 +90,15 @@ class BaseExpression: self.pattern = pattern else: pattern = re.escape(pattern) - self.search_re = re.compile(r'%s[%s]*' % (pattern, - config.chars_lower), flags) + if stemmed: + # XXX: works, but pretty CPU-intensive (obviously...) 
+ self.search_re = re.compile(r'(?=^|[\s]+|[^%s]+)%s[%s]*' % + (config.chars_lower, case and pattern or + ''.join(['[%s%s]' % (ch.upper(), ch.lower()) + for ch in pattern]), + config.chars_lower), re.U) + else: + self.search_re = re.compile(pattern, flags) self.pattern = pattern @@ -311,10 +318,9 @@ class TextSearch(BaseExpression): t = [UnicodeQuery(i) for i in analyzer.tokenize(t)] queries.append(Query(Query.OP_AND, t)) - # TODO: hilight and sort stemmed words correctly (also in TitleSearch) - #if stemmed: - # self._build_re(' '.join(stemmed), use_re=False, - # case=self.case) + if stemmed: + self._build_re(' '.join(stemmed), use_re=False, + case=self.case, stemmed=True) # titlesearch OR parsed wikiwords return Query(Query.OP_OR, @@ -386,18 +392,26 @@ class TitleSearch(BaseExpression): # all parsed wikiwords, AND'ed queries = [] + stemmed = [] for t in terms: if Xapian.use_stemming: # stemmed OR not stemmed - t = [UnicodeQuery(Query.OP_OR, ['%s%s' % - (Xapian.Index.prefixMap['title'], j) for j in i]) - for i in analyzer.tokenize(t, flat_stemming=False)] + tmp = [] + for i in analyzer.tokenize(t, flat_stemming=False): + tmp.append(UnicodeQuery(Query.OP_OR, ['%s%s' % + (Xapian.Index.prefixMap['title'], j) for j in i])) + stemmed.append(i[1]) + t = tmp else: # just not stemmed t = [UnicodeQuery('%s%s' % (Xapian.Index.prefixMap['title'], i)) for i in analyzer.tokenize(t)] queries.append(Query(Query.OP_AND, t)) + + if stemmed: + self._build_re(' '.join(stemmed), use_re=False, + case=self.case, stemmed=True) return Query(Query.OP_AND, queries) diff -r 4d1bc2e51184 -r 0ccd65be5656 docs/CHANGES.fpletz --- a/docs/CHANGES.fpletz Sat Jun 17 20:53:12 2006 +0200 +++ b/docs/CHANGES.fpletz Sun Jun 18 01:06:50 2006 +0200 @@ -2,13 +2,12 @@ Branch moin/1.6-xapian-fpletz ============================= Known main issues: - * Somethings' wrong with the matching of stemmed terms, i.e. 
matches - beyond single WikiWord borders although matching lower-case only - (see MoinMoin/search.py:92) + * _moinSearch matches all characters in words when stemming, + workaround uses too much CPU * Matching of stemmed terms is generally unreliable because the matches (and consequently the count) are not obtained by Xapian as _moinSearch is called with the Xapian results. Use the Xapian - matches? + matches somehow? * Regex searching with Xapian? ToDo: @@ -58,4 +57,9 @@ 2006-06-16 bugfixes) 2006-06-17 + * Tackled some of the issues with matching stemmed words. Need some + advice on how to detect and match them reliably using the current + framework +2006-06-18 + # HG changeset patch # User Franz Pletz # Date 1150703632 -7200 # Node ID 481c72d4a181fc60d251e5e891b7591270dc5900 # Parent 210f3adb44de951a144867ec9c76700c85d042e3 support for common indices directory cfg.xapian_index_dir diff -r 210f3adb44de -r 481c72d4a181 MoinMoin/Xapian.py --- a/MoinMoin/Xapian.py Sun Jun 18 01:07:21 2006 +0200 +++ b/MoinMoin/Xapian.py Mon Jun 19 09:53:52 2006 +0200 @@ -280,29 +280,34 @@ class Index: #Y year (four digits) } - - class LockedException(Exception): pass def __init__(self, request): self.request = request cache_dir = request.cfg.cache_dir - self.main_dir = os.path.join(cache_dir, 'xapian') - self.dir = os.path.join(self.main_dir, 'index') + main_dir = self._main_dir() + self.dir = os.path.join(main_dir, 'index') filesys.makeDirs(self.dir) - self.sig_file = os.path.join(self.main_dir, 'complete') - lock_dir = os.path.join(self.main_dir, 'index-lock') + self.sig_file = os.path.join(main_dir, 'complete') + lock_dir = os.path.join(main_dir, 'index-lock') self.lock = lock.WriteLock(lock_dir, timeout=3600.0, readlocktimeout=60.0) self.read_lock = lock.ReadLock(lock_dir, timeout=3600.0) - self.queue = UpdateQueue(os.path.join(self.main_dir, "update-queue"), - os.path.join(self.main_dir, 'update-queue-lock')) - + self.queue = UpdateQueue(os.path.join(main_dir, 'update-queue'), + 
os.path.join(main_dir, 'update-queue-lock')) + # Disabled until we have a sane way to build the index with a # queue in small steps. ## if not self.exists(): ## self.indexPagesInNewThread(request) + + def _main_dir(self): + if self.request.cfg.xapian_index_dir: + return os.path.join(self.request.cfg.xapian_index_dir, + self.request.cfg.siteid) + else: + return os.path.join(request.cfg.cache_dir, 'xapian') def exists(self): """ Check if index exists """ diff -r 210f3adb44de -r 481c72d4a181 MoinMoin/multiconfig.py --- a/MoinMoin/multiconfig.py Sun Jun 18 01:07:21 2006 +0200 +++ b/MoinMoin/multiconfig.py Mon Jun 19 09:53:52 2006 +0200 @@ -481,6 +481,11 @@ reStructuredText Quick Reference name = dirname + '_dir' if not getattr(self, name, None): setattr(self, name, os.path.join(data_dir, dirname)) + + # common xapian index directory + if getattr(self, 'xapian_search', False): + name = 'xapian_index_dir' + setattr(self, name, getattr(self, name, None)) # Try to decode certain names which allow unicode self._decode() diff -r 210f3adb44de -r 481c72d4a181 docs/CHANGES.fpletz --- a/docs/CHANGES.fpletz Sun Jun 18 01:07:21 2006 +0200 +++ b/docs/CHANGES.fpletz Mon Jun 19 09:53:52 2006 +0200 @@ -15,14 +15,16 @@ Branch moin/1.6-xapian-fpletz metadata) * Mockup the new search UI * Write/update documentation for all the new search stuff - * Wikifarms support (multiple indexes) - * Indexing and searching of Categories (new term prefix) + * Indexing and searching of categories (new term prefix) * Finish the stemming/matching stuff New Features: * Faster search thanks to Xapian * Searching for languages with new prefix lang/language, i.e. lang:de Note: Only available when Xapian is activated + * New config options: + xapian_search (bool) enables xapian-powered search + xapian_index_dir (string) directory for xapian indices Bugfixes (only stuff that is buggy in moin/1.6 main branch): * ... 
@@ -61,5 +63,24 @@ 2006-06-17 advice on how to detect and match them reliably using the current framework -2006-06-18 +2006-06-19 + * Introducing xapian_index_dir as a global directory for multiple + xapian indices i.e. for wikifarms. + Layout: + xapian_index_dir/ + siteid1/ + complete + index/ + index-lock/ + update-queue-lock/ + siteid2/ + complete + index/ + index-lock/ + update-queue-lock/ + ... + + Possible extension: Xapian can handle multiple databases, maybe + allow searching across defined wikis on a wikifarm? + # HG changeset patch # User Franz Pletz # Date 1150708564 -7200 # Node ID e69f2c2a238d0c6f3c45a19f11fc99505d8e4ce1 # Parent 481c72d4a181fc60d251e5e891b7591270dc5900 matching stemmed words works reliably and, most importantly, fast ;) diff -r 481c72d4a181 -r e69f2c2a238d MoinMoin/search.py --- a/MoinMoin/search.py Mon Jun 19 09:53:52 2006 +0200 +++ b/MoinMoin/search.py Mon Jun 19 11:16:04 2006 +0200 @@ -90,15 +90,7 @@ class BaseExpression: self.pattern = pattern else: pattern = re.escape(pattern) - if stemmed: - # XXX: works, but pretty CPU-intensive (obviously...) 
- self.search_re = re.compile(r'(?=^|[\s]+|[^%s]+)%s[%s]*' % - (config.chars_lower, case and pattern or - ''.join(['[%s%s]' % (ch.upper(), ch.lower()) - for ch in pattern]), - config.chars_lower), re.U) - else: - self.search_re = re.compile(pattern, flags) + self.search_re = re.compile(pattern, flags) self.pattern = pattern @@ -280,7 +272,23 @@ class TextSearch(BaseExpression): # Search in page body body = page.get_raw_body() for match in self.search_re.finditer(body): - matches.append(TextMatch(re_match=match)) + if Xapian.use_stemming: + # somewhere in regular word + if body[match.start()] not in config.chars_upper and \ + body[match.start()-1] in config.chars_lower: + continue + + post = 0 + for c in body[match.end():]: + if c in config.chars_lower: + post += 1 + else: + break + + matches.append(TextMatch(start=match.start(), + end=match.end()+post)) + else: + matches.append(TextMatch(re_match=match)) # Decide what to do with the results. if ((self.negated and matches) or @@ -368,7 +376,23 @@ class TitleSearch(BaseExpression): # Get matches in page name matches = [] for match in self.search_re.finditer(page.page_name): - matches.append(TitleMatch(re_match=match)) + if Xapian.use_stemming: + # somewhere in regular word + if page.page_name[match.start()] not in config.chars_upper and \ + page.page_name[match.start()-1] in config.chars_lower: + continue + + post = 0 + for c in page.page_name[match.end():]: + if c in config.chars_lower: + post += 1 + else: + break + + matches.append(TitleMatch(start=match.start(), + end=match.end()+post)) + else: + matches.append(TitleMatch(re_match=match)) if ((self.negated and matches) or (not self.negated and not matches)): diff -r 481c72d4a181 -r e69f2c2a238d docs/CHANGES.fpletz --- a/docs/CHANGES.fpletz Mon Jun 19 09:53:52 2006 +0200 +++ b/docs/CHANGES.fpletz Mon Jun 19 11:16:04 2006 +0200 @@ -2,12 +2,6 @@ Branch moin/1.6-xapian-fpletz ============================= Known main issues: - * _moinSearch matches all characters in 
words when stemming, - workaround uses too much CPU - * Matching of stemmed terms is generally unreliable because the - matches (and consequently the count) are not obtained by Xapian - as _moinSearch is called with the Xapian results. Use the Xapian - matches somehow? * Regex searching with Xapian? ToDo: @@ -16,7 +10,6 @@ Branch moin/1.6-xapian-fpletz * Mockup the new search UI * Write/update documentation for all the new search stuff * Indexing and searching of categories (new term prefix) - * Finish the stemming/matching stuff New Features: * Faster search thanks to Xapian @@ -83,4 +76,5 @@ 2006-06-19 Possible extension: Xapian can handle multiple databases, maybe allow searching across defined wikis on a wikifarm? + * All stemming/matching issues resolved (hopefully) # HG changeset patch # User Franz Pletz # Date 1150709460 -7200 # Node ID d93a8a6a45594bb61ec1745e1a803992eaad66cd # Parent e69f2c2a238d0c6f3c45a19f11fc99505d8e4ce1 now searching works without having xapian installed diff -r e69f2c2a238d -r d93a8a6a4559 MoinMoin/Xapian.py --- a/MoinMoin/Xapian.py Mon Jun 19 11:16:04 2006 +0200 +++ b/MoinMoin/Xapian.py Mon Jun 19 11:31:00 2006 +0200 @@ -12,6 +12,7 @@ from pprint import pprint from pprint import pprint import xapian +from xapian import Query from MoinMoin.support.xapwrap import document as xapdoc from MoinMoin.support.xapwrap import index as xapidx from MoinMoin.parser.text_moin_wiki import Parser as WikiParser diff -r e69f2c2a238d -r d93a8a6a4559 MoinMoin/search.py --- a/MoinMoin/search.py Mon Jun 19 11:16:04 2006 +0200 +++ b/MoinMoin/search.py Mon Jun 19 11:31:00 2006 +0200 @@ -15,9 +15,12 @@ from MoinMoin import wikiutil, config from MoinMoin import wikiutil, config from MoinMoin.Page import Page -import Xapian -from xapian import Query -from Xapian import UnicodeQuery +try: + import Xapian + from Xapian import Query, UnicodeQuery + use_stemming = Xapian.use_stemming +except ImportError: + use_stemming = False 
############################################################################# ### query objects @@ -272,7 +275,7 @@ class TextSearch(BaseExpression): # Search in page body body = page.get_raw_body() for match in self.search_re.finditer(body): - if Xapian.use_stemming: + if use_stemming: # somewhere in regular word if body[match.start()] not in config.chars_upper and \ body[match.start()-1] in config.chars_lower: @@ -314,7 +317,7 @@ class TextSearch(BaseExpression): queries = [] stemmed = [] for t in terms: - if Xapian.use_stemming: + if use_stemming: # stemmed OR not stemmed tmp = [] for i in analyzer.tokenize(t, flat_stemming=False): @@ -376,7 +379,7 @@ class TitleSearch(BaseExpression): # Get matches in page name matches = [] for match in self.search_re.finditer(page.page_name): - if Xapian.use_stemming: + if use_stemming: # somewhere in regular word if page.page_name[match.start()] not in config.chars_upper and \ page.page_name[match.start()-1] in config.chars_lower: @@ -418,7 +421,7 @@ class TitleSearch(BaseExpression): queries = [] stemmed = [] for t in terms: - if Xapian.use_stemming: + if use_stemming: # stemmed OR not stemmed tmp = [] for i in analyzer.tokenize(t, flat_stemming=False): diff -r e69f2c2a238d -r d93a8a6a4559 docs/CHANGES.fpletz --- a/docs/CHANGES.fpletz Mon Jun 19 11:16:04 2006 +0200 +++ b/docs/CHANGES.fpletz Mon Jun 19 11:31:00 2006 +0200 @@ -77,4 +77,5 @@ 2006-06-19 Possible extension: Xapian can handle multiple databases, maybe allow searching across defined wikis on a wikifarm? 
* All stemming/matching issues resolved (hopefully) + * Works now without xapian installed (enhance error reporting) # HG changeset patch # User Franz Pletz # Date 1150709829 -7200 # Node ID db0e4edcb6ac2f17e98b91b105667fbd8c1c8439 # Parent d93a8a6a45594bb61ec1745e1a803992eaad66cd fallback to moinSearch if xapian is actived but not installed diff -r d93a8a6a4559 -r db0e4edcb6ac MoinMoin/search.py --- a/MoinMoin/search.py Mon Jun 19 11:31:00 2006 +0200 +++ b/MoinMoin/search.py Mon Jun 19 11:37:09 2006 +0200 @@ -1368,8 +1368,11 @@ class Search: return moin search in those pages. """ pages = None - index = Xapian.Index(self.request) - if index.exists() and self.query.xapian_wanted(): + try: + index = Xapian.Index(self.request) + except NameError: + index = None + if index and index.exists() and self.query.xapian_wanted(): self.request.clock.start('_xapianSearch') try: from MoinMoin.support import xapwrap # HG changeset patch # User Franz Pletz # Date 1150711126 -7200 # Node ID c76dd5d97e0ea2ad74362ac3b31f3c2d0df5a7e7 # Parent db0e4edcb6ac2f17e98b91b105667fbd8c1c8439 small fixes and cleanups diff -r db0e4edcb6ac -r c76dd5d97e0e MoinMoin/multiconfig.py --- a/MoinMoin/multiconfig.py Mon Jun 19 11:37:09 2006 +0200 +++ b/MoinMoin/multiconfig.py Mon Jun 19 11:58:46 2006 +0200 @@ -276,6 +276,7 @@ reStructuredText Quick Reference # instead of just IPs xapian_search = False # disabled until xapian is finished + xapian_index_dir = None mail_login = None # or "user pwd" if you need to use SMTP AUTH mail_sendmail = None # "/usr/sbin/sendmail -t -i" to not use SMTP, but sendmail @@ -482,11 +483,6 @@ reStructuredText Quick Reference if not getattr(self, name, None): setattr(self, name, os.path.join(data_dir, dirname)) - # common xapian index directory - if getattr(self, 'xapian_search', False): - name = 'xapian_index_dir' - setattr(self, name, getattr(self, name, None)) - # Try to decode certain names which allow unicode self._decode() diff -r db0e4edcb6ac -r c76dd5d97e0e 
docs/CHANGES.fpletz --- a/docs/CHANGES.fpletz Mon Jun 19 11:37:09 2006 +0200 +++ b/docs/CHANGES.fpletz Mon Jun 19 11:58:46 2006 +0200 @@ -10,14 +10,15 @@ Branch moin/1.6-xapian-fpletz * Mockup the new search UI * Write/update documentation for all the new search stuff * Indexing and searching of categories (new term prefix) + * MoinMoin.Xapian.use_stemming -> request.cfg.xapian_use_stemming New Features: * Faster search thanks to Xapian * Searching for languages with new prefix lang/language, i.e. lang:de Note: Only available when Xapian is activated * New config options: - xapian_search (bool) enables xapian-powered search - xapian_index_dir (string) directory for xapian indices + xapian_search 0 enables xapian-powered search + xapian_index_dir None directory for xapian indices Bugfixes (only stuff that is buggy in moin/1.6 main branch): * ... @@ -75,7 +76,7 @@ 2006-06-19 ... Possible extension: Xapian can handle multiple databases, maybe - allow searching across defined wikis on a wikifarm? 
+ allow searching across defined wikis on a wikifarm * All stemming/matching issues resolved (hopefully) * Works now without xapian installed (enhance error reporting) # HG changeset patch # User Franz Pletz # Date 1150929606 -7200 # Node ID d0af8dce4d0ec7eccfb41730e9f6e9673755972e # Parent 01750f3c867cfec061494a249c08536f5a061659 Xapian.use_stemming -> request.cfg.xapian_stemming and stemming lang bugfix diff -r 01750f3c867c -r d0af8dce4d0e MoinMoin/Xapian.py --- a/MoinMoin/Xapian.py Tue Jun 20 21:13:27 2006 +0200 +++ b/MoinMoin/Xapian.py Thu Jun 22 00:40:06 2006 +0200 @@ -24,9 +24,9 @@ try: try: # PyStemmer, snowball python bindings from http://snowball.tartarus.org/ from Stemmer import Stemmer - use_stemming = True + stemmer_available = True except ImportError: - use_stemming = False + stemmer_available = False class UnicodeQuery(xapian.Query): def __init__(self, *args, **kwargs): @@ -47,8 +47,8 @@ class UnicodeQuery(xapian.Query): ### Tokenizer ############################################################################## -def getWikiAnalyzerFactory(language='en'): - return (lambda: WikiAnalyzer(language)) +def getWikiAnalyzerFactory(request=None, language='en'): + return (lambda: WikiAnalyzer(request, language)) class WikiAnalyzer: singleword = r"[%(u)s][%(l)s]+" % { @@ -74,8 +74,8 @@ class WikiAnalyzer: # XXX limit stuff above to xapdoc.MAX_KEY_LEN # WORD_RE = re.compile('\\w{1,%i}' % MAX_KEY_LEN, re.U) - def __init__(self, language=None): - if use_stemming and language: + def __init__(self, request=None, language=None): + if request and request.cfg.xapian_stemming and language: self.stemmer = Stemmer(language) else: self.stemmer = None @@ -302,6 +302,10 @@ class Index: # queue in small steps. 
## if not self.exists(): ## self.indexPagesInNewThread(request) + + # Check if we should and can stem words + if request.cfg.xapian_stemming and not stemmer_available: + request.cfg.xapian_stemming = False def _main_dir(self): if self.request.cfg.xapian_index_dir: @@ -530,7 +534,7 @@ class Index: lang = '' - if use_stemming: + if page.request.cfg.xapian_stemming: for line in body.split('\n'): if line.startswith('#language'): lang = line.split(' ')[1] @@ -603,7 +607,8 @@ class Index: keywords=xkeywords, sortFields=(xpname, xattachment, xmtime, xwname, ), ) - doc.analyzerFactory = getWikiAnalyzerFactory() + doc.analyzerFactory = getWikiAnalyzerFactory(request, + stem_language) if mode == 'update': if debug: request.log("%s (replace %r)" % (pagename, uid)) @@ -643,14 +648,16 @@ class Index: xmtime = xapdoc.SortKey('mtime', mtime) xtitle = xapdoc.Keyword('title', '%s/%s' % (pagename, att)) xlanguage = xapdoc.Keyword('lang', language) + xstem_language = xapdoc.Keyword('stem_lang', stem_language) mimetype, att_content = self.contentfilter(filename) xmimetype = xapdoc.TextField('mimetype', mimetype, True) xcontent = xapdoc.TextField('content', att_content) doc = xapdoc.Document(textFields=(xcontent, xmimetype, ), - keywords=(xatt_itemid, xtitle, xlanguage, ), + keywords=(xatt_itemid, xtitle, xlanguage, xstem_language, ), sortFields=(xpname, xattachment, xmtime, xwname, ), ) - doc.analyzerFactory = getWikiAnalyzerFactory() + doc.analyzerFactory = getWikiAnalyzerFactory(request, + stem_language) if mode == 'update': if debug: request.log("%s (replace %r)" % (pagename, uid)) doc.uid = uid diff -r 01750f3c867c -r d0af8dce4d0e MoinMoin/multiconfig.py --- a/MoinMoin/multiconfig.py Tue Jun 20 21:13:27 2006 +0200 +++ b/MoinMoin/multiconfig.py Thu Jun 22 00:40:06 2006 +0200 @@ -277,6 +277,7 @@ reStructuredText Quick Reference xapian_search = False # disabled until xapian is finished xapian_index_dir = None + xapian_stemming = True mail_login = None # or "user pwd" if you need to 
use SMTP AUTH mail_sendmail = None # "/usr/sbin/sendmail -t -i" to not use SMTP, but sendmail diff -r 01750f3c867c -r d0af8dce4d0e MoinMoin/search.py --- a/MoinMoin/search.py Tue Jun 20 21:13:27 2006 +0200 +++ b/MoinMoin/search.py Thu Jun 22 00:40:06 2006 +0200 @@ -18,9 +18,8 @@ try: try: import Xapian from Xapian import Query, UnicodeQuery - use_stemming = Xapian.use_stemming except ImportError: - use_stemming = False + pass ############################################################################# ### query objects @@ -275,7 +274,7 @@ class TextSearch(BaseExpression): # Search in page body body = page.get_raw_body() for match in self.search_re.finditer(body): - if use_stemming: + if page.request.cfg.xapian_stemming: # somewhere in regular word if body[match.start()] not in config.chars_upper and \ body[match.start()-1] in config.chars_lower: @@ -310,14 +309,15 @@ class TextSearch(BaseExpression): if self.use_re: return None # xapian can't do regex search else: - analyzer = Xapian.WikiAnalyzer(language=request.cfg.language_default) + analyzer = Xapian.WikiAnalyzer(request=request, + language=request.cfg.language_default) terms = self._pattern.split() # all parsed wikiwords, AND'ed queries = [] stemmed = [] for t in terms: - if use_stemming: + if request.cfg.xapian_stemming: # stemmed OR not stemmed tmp = [] for i in analyzer.tokenize(t, flat_stemming=False): @@ -379,7 +379,7 @@ class TitleSearch(BaseExpression): # Get matches in page name matches = [] for match in self.search_re.finditer(page.page_name): - if use_stemming: + if page.request.cfg.xapian_stemming: # somewhere in regular word if page.page_name[match.start()] not in config.chars_upper and \ page.page_name[match.start()-1] in config.chars_lower: @@ -413,7 +413,8 @@ class TitleSearch(BaseExpression): if self.use_re: return None # xapian doesn't support regex search else: - analyzer = Xapian.WikiAnalyzer(language=request.cfg.language_default) + analyzer = Xapian.WikiAnalyzer(request=request, + 
language=request.cfg.language_default) terms = self._pattern.split() terms = [list(analyzer.raw_tokenize(t)) for t in terms] @@ -421,7 +422,7 @@ class TitleSearch(BaseExpression): queries = [] stemmed = [] for t in terms: - if use_stemming: + if request.cfg.xapian_stemming: # stemmed OR not stemmed tmp = [] for i in analyzer.tokenize(t, flat_stemming=False): diff -r 01750f3c867c -r d0af8dce4d0e docs/CHANGES.fpletz --- a/docs/CHANGES.fpletz Tue Jun 20 21:13:27 2006 +0200 +++ b/docs/CHANGES.fpletz Thu Jun 22 00:40:06 2006 +0200 @@ -3,22 +3,25 @@ Branch moin/1.6-xapian-fpletz Known main issues: * Regex searching with Xapian? + * HACK: MoinMoin.Xapian.Index._get_languages (wait for proper metadata) ToDo: - * HACK: MoinMoin.Xapian.Index._get_languages (wait for proper - metadata) * Mockup the new search UI * Write/update documentation for all the new search stuff * Indexing and searching of categories (new term prefix) - * MoinMoin.Xapian.use_stemming -> request.cfg.xapian_use_stemming + * Drop _moinSearch when using Xapian and use term positions provided + by Xapian itself, needs some reworking of WikiAnalyzer/xapwrap to + get the position of stemmed words right New Features: * Faster search thanks to Xapian * Searching for languages with new prefix lang/language, i.e. lang:de - Note: Only available when Xapian is activated + Note: Currently only available when Xapian is used * New config options: xapian_search 0 enables xapian-powered search xapian_index_dir None directory for xapian indices + xapian_stemming True Toggles usage of stemmer, fallback + to False if no stemmer installed Bugfixes (only stuff that is buggy in moin/1.6 main branch): * ... 
@@ -80,3 +83,9 @@ 2006-06-19 * All stemming/matching issues resolved (hopefully) * Works now without xapian installed (enhance error reporting) +2006-06-21 + * Making stemming configurable (xapian_stemming) with fallback to + False if no stemmer available + * Xapian.use_stemming -> request.cfg.xapian_stemming + * Fixed bug in the selection of the stemming language + # HG changeset patch # User Franz Pletz # Date 1150972799 -7200 # Node ID 04c4f745620f49e11ad713319764d585f0eb1287 # Parent d0af8dce4d0ec7eccfb41730e9f6e9673755972e small cleanup in stemmer import & availability handling diff -r d0af8dce4d0e -r 04c4f745620f MoinMoin/Xapian.py --- a/MoinMoin/Xapian.py Thu Jun 22 00:40:06 2006 +0200 +++ b/MoinMoin/Xapian.py Thu Jun 22 12:39:59 2006 +0200 @@ -24,9 +24,8 @@ try: try: # PyStemmer, snowball python bindings from http://snowball.tartarus.org/ from Stemmer import Stemmer - stemmer_available = True except ImportError: - stemmer_available = False + Stemmer = None class UnicodeQuery(xapian.Query): def __init__(self, *args, **kwargs): @@ -304,7 +303,7 @@ class Index: ## self.indexPagesInNewThread(request) # Check if we should and can stem words - if request.cfg.xapian_stemming and not stemmer_available: + if request.cfg.xapian_stemming and not Stemmer: request.cfg.xapian_stemming = False def _main_dir(self): # HG changeset patch # User Franz Pletz # Date 1151413786 -7200 # Node ID 5469c8b911a4f261c579ddf05a96c590b4ae55ef # Parent d9bd5d6ae30d72f21730e79d6e0caa2c5f607ac3 Splitting out MoinMoin/search.py to MoinMoin/search/*.py diff -r d9bd5d6ae30d -r 5469c8b911a4 MoinMoin/action/fckdialog.py --- a/MoinMoin/action/fckdialog.py Tue Jun 27 13:58:39 2006 +0200 +++ b/MoinMoin/action/fckdialog.py Tue Jun 27 15:09:46 2006 +0200 @@ -165,10 +165,7 @@ def page_list(request): from MoinMoin import search name = request.form.get("pagename",[""])[0] if name: - searchresult = search.searchPages( - request, - search.QueryParser().parse_query('t:"%s"' % name)) - + searchresult = 
search.searchPages(request, 't:"%s"' % name) pages = [p.page_name for p in searchresult.hits] else: pages = [name] @@ -209,9 +206,7 @@ def link_dialog(request): if name: from MoinMoin import search # XXX error handling! - searchresult = search.searchPages( - request, - search.QueryParser().parse_query('t:"%s"' % name)) + searchresult = search.searchPages(request, 't:"%s"' % name) pages = [p.page_name for p in searchresult.hits] pages.sort() @@ -378,9 +373,7 @@ def attachment_dialog(request): if name: from MoinMoin import search # XXX error handling! - searchresult = search.searchPages( - request, - search.QueryParser().parse_query('t:"%s"' % name)) + searchresult = search.searchPages(request, 't:"%s"' % name) pages = [p.page_name for p in searchresult.hits] pages.sort() diff -r d9bd5d6ae30d -r 5469c8b911a4 MoinMoin/action/fullsearch.py --- a/MoinMoin/action/fullsearch.py Tue Jun 27 13:58:39 2006 +0200 +++ b/MoinMoin/action/fullsearch.py Tue Jun 27 15:09:46 2006 +0200 @@ -52,14 +52,13 @@ def execute(pagename, request, fieldname 'of {{{"%s"}}}') % needle # send http headers request.http_headers() - Page(request, pagename).send_page(request, msg=err) + Page(request, pagename).send_page(request, msg=err) return # search the pages from MoinMoin import search - query = search.QueryParser(case=case, regex=regex, - titlesearch=titlesearch).parse_query(needle) - results = search.searchPages(request, query) + results = search.searchPages(request, needle, case=case, + regex=regex, titlesearch=titlesearch) # directly show a single hit # XXX won't work with attachment search diff -r d9bd5d6ae30d -r 5469c8b911a4 MoinMoin/macro/FullSearch.py --- a/MoinMoin/macro/FullSearch.py Tue Jun 27 13:58:39 2006 +0200 +++ b/MoinMoin/macro/FullSearch.py Tue Jun 27 15:09:46 2006 +0200 @@ -54,8 +54,7 @@ def execute(macro, needle): needle = needle.strip() # Search the pages and return the results - query = search.QueryParser().parse_query(needle) - results = search.searchPages(request, query) + 
results = search.searchPages(request, needle) results.sortByPagename() return results.pageList(request, macro.formatter) diff -r d9bd5d6ae30d -r 5469c8b911a4 MoinMoin/macro/__init__.py --- a/MoinMoin/macro/__init__.py Tue Jun 27 13:58:39 2006 +0200 +++ b/MoinMoin/macro/__init__.py Tue Jun 27 15:09:46 2006 +0200 @@ -328,8 +328,9 @@ class Macro: return '%s' % err # Return a title search for needle, sorted by name. - query = search.QueryParser(literal=literal, titlesearch=1, case=case).parse_query(needle) - results = search.searchPages(self.request, query) + # XXX: what's with literal? + results = search.searchPages(self.request, needle, + titlesearch=1, case=case) results.sortByPagename() return results.pageList(self.request, self.formatter) diff -r d9bd5d6ae30d -r 5469c8b911a4 MoinMoin/xmlrpc/__init__.py --- a/MoinMoin/xmlrpc/__init__.py Tue Jun 27 13:58:39 2006 +0200 +++ b/MoinMoin/xmlrpc/__init__.py Tue Jun 27 15:09:46 2006 +0200 @@ -484,8 +484,7 @@ class XmlRpcBase: def xmlrpc_searchPages(self, query_string): from MoinMoin import search - query = search.QueryParser().parse_query(query_string) - results = search.searchPages(self.request, query) + results = search.searchPages(self.request, query_string) results.formatter = self.request.html_formatter results.request = self.request return [(self._outstr(hit.page_name), diff -r d9bd5d6ae30d -r 5469c8b911a4 docs/CHANGES.fpletz --- a/docs/CHANGES.fpletz Tue Jun 27 13:58:39 2006 +0200 +++ b/docs/CHANGES.fpletz Tue Jun 27 15:09:46 2006 +0200 @@ -89,3 +89,7 @@ 2006-06-21 * Xapian.use_stemming -> request.cfg.xapian_stemming * Fixed bug in the selection of the stemming language +2006-06-27 + * Splitting out MoinMoin/search.py to MoinMoin/search/*.py, no more + need to invoke QueryParser manually when using searchPages + diff -r d9bd5d6ae30d -r 5469c8b911a4 MoinMoin/search/Xapian.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MoinMoin/search/Xapian.py Tue Jun 27 15:09:46 2006 +0200 @@ -0,0 +1,771 @@ +# -*- coding: 
iso-8859-1 -*- +""" + MoinMoin - xapian indexing search engine + + @copyright: 2006 MoinMoin:ThomasWaldmann, + 2006 MoinMoin:FranzPletz + @license: GNU GPL, see COPYING for details. +""" +debug = True + +import sys, os, re, codecs, errno, time +from pprint import pprint + +import xapian +from xapian import Query +from MoinMoin.support.xapwrap import document as xapdoc +from MoinMoin.support.xapwrap import index as xapidx +from MoinMoin.parser.text_moin_wiki import Parser as WikiParser + +from MoinMoin.Page import Page +from MoinMoin import config, wikiutil +from MoinMoin.util import filesys, lock + +try: + # PyStemmer, snowball python bindings from http://snowball.tartarus.org/ + from Stemmer import Stemmer +except ImportError: + Stemmer = None + +class UnicodeQuery(xapian.Query): + def __init__(self, *args, **kwargs): + self.encoding = kwargs.get('encoding', config.charset) + + nargs = [] + for term in args: + if isinstance(term, unicode): + term = term.encode(self.encoding) + elif isinstance(term, list) or isinstance(term, tuple): + term = [t.encode(self.encoding) for t in term] + nargs.append(term) + + xapian.Query.__init__(self, *nargs, **kwargs) + + +############################################################################## +### Tokenizer +############################################################################## + +def getWikiAnalyzerFactory(request=None, language='en'): + return (lambda: WikiAnalyzer(request, language)) + +class WikiAnalyzer: + singleword = r"[%(u)s][%(l)s]+" % { + 'u': config.chars_upper, + 'l': config.chars_lower, + } + + singleword_re = re.compile(singleword, re.U) + wikiword_re = re.compile(WikiParser.word_rule, re.U) + + token_re = re.compile( + r"(?P\w+[&@]\w+)|" + # company names like AT&T and Excite@Home. + r"(?P\w+([.-]\w+)*@\w+([.-]\w+)*)|" + # email addresses + r"(?P\w+(\.\w+)+)|" + # hostnames + r"(?P(\w+[-/.,])*\w*\d\w*([-/.,]\w+)*)|" + # version numbers + r"(?P(\w\.)+)|" + # acronyms: U.S.A., I.B.M., etc. 
+ r"(?P\w+)", # words (including WikiWords) + re.U) + + dot_re = re.compile(r"[-_/,.]") + mail_re = re.compile(r"[-_/,.]|(@)") + + # XXX limit stuff above to xapdoc.MAX_KEY_LEN + # WORD_RE = re.compile('\\w{1,%i}' % MAX_KEY_LEN, re.U) + + def __init__(self, request=None, language=None): + if request and request.cfg.xapian_stemming and language: + self.stemmer = Stemmer(language) + else: + self.stemmer = None + + def raw_tokenize(self, value): + def enc(uc): + """ 'encode' unicode results into whatever xapian / xapwrap wants """ + lower = uc.lower() + return lower + + if isinstance(value, list): # used for page links + for v in value: + yield enc(v) + else: + tokenstream = re.finditer(self.token_re, value) + for m in tokenstream: + if m.group("acronym"): + yield enc(m.group("acronym").replace('.', '')) + elif m.group("company"): + yield enc(m.group("company")) + elif m.group("email"): + for word in self.mail_re.split(m.group("email")): + if word: + yield enc(word) + elif m.group("hostname"): + for word in self.dot_re.split(m.group("hostname")): + yield enc(word) + elif m.group("num"): + for word in self.dot_re.split(m.group("num")): + yield enc(word) + elif m.group("word"): + word = m.group("word") + yield enc(word) + # if it is a CamelCaseWord, we additionally yield Camel, Case and Word + if self.wikiword_re.match(word): + for sm in re.finditer(self.singleword_re, word): + yield enc(sm.group()) + + def tokenize(self, value, flat_stemming=True): + """Yield a stream of lower cased raw and stemmed (optional) words from a string. + value must be an UNICODE object or a list of unicode objects + """ + for i in self.raw_tokenize(value): + if flat_stemming: + yield i # XXX: should we really use a prefix for that? 
Index.prefixMap['raw'] + i + if self.stemmer: + yield self.stemmer.stemWord(i) + else: + yield (i, self.stemmer.stemWord(i)) + + +############################################################################# +### Indexing +############################################################################# + +class UpdateQueue: + def __init__(self, file, lock_dir): + self.file = file + self.writeLock = lock.WriteLock(lock_dir, timeout=10.0) + self.readLock = lock.ReadLock(lock_dir, timeout=10.0) + + def exists(self): + return os.path.exists(self.file) + + def append(self, pagename): + """ Append a page to queue """ + if not self.writeLock.acquire(60.0): + request.log("can't add %r to xapian update queue: can't lock queue" % + pagename) + return + try: + f = codecs.open(self.file, 'a', config.charset) + try: + f.write(pagename + "\n") + finally: + f.close() + finally: + self.writeLock.release() + + def pages(self): + """ Return list of pages in the queue """ + if self.readLock.acquire(1.0): + try: + return self._decode(self._read()) + finally: + self.readLock.release() + return [] + + def remove(self, pages): + """ Remove pages from the queue + + When the queue is empty, the queue file is removed, so exists() + can tell if there is something waiting in the queue. 
+ """ + if self.writeLock.acquire(30.0): + try: + queue = self._decode(self._read()) + for page in pages: + try: + queue.remove(page) + except ValueError: + pass + if queue: + self._write(queue) + else: + self._removeFile() + return True + finally: + self.writeLock.release() + return False + + # Private ------------------------------------------------------- + + def _decode(self, data): + """ Decode queue data """ + pages = data.splitlines() + return self._filterDuplicates(pages) + + def _filterDuplicates(self, pages): + """ Filter duplicates in page list, keeping the order """ + unique = [] + seen = {} + for name in pages: + if not name in seen: + unique.append(name) + seen[name] = 1 + return unique + + def _read(self): + """ Read and return queue data + + This does not do anything with the data so we can release the + lock as soon as possible, enabling others to update the queue. + """ + try: + f = codecs.open(self.file, 'r', config.charset) + try: + return f.read() + finally: + f.close() + except (OSError, IOError), err: + if err.errno != errno.ENOENT: + raise + return '' + + def _write(self, pages): + """ Write pages to queue file + + Requires queue write locking. + """ + # XXX use tmpfile/move for atomic replace on real operating systems + data = '\n'.join(pages) + '\n' + f = codecs.open(self.file, 'w', config.charset) + try: + f.write(data) + finally: + f.close() + + def _removeFile(self): + """ Remove queue file + + Requires queue write locking. + """ + try: + os.remove(self.file) + except OSError, err: + if err.errno != errno.ENOENT: + raise + + +class Index: + indexValueMap = { + # mapping the value names we can easily fetch from the index to + # integers required by xapian. 0 and 1 are reserved by xapwrap! 
+ 'pagename': 2, + 'attachment': 3, + 'mtime': 4, + 'wikiname': 5, + } + prefixMap = { + # http://svn.xapian.org/*checkout*/trunk/xapian-applications/omega/docs/termprefixes.txt + 'author': 'A', + 'date': 'D', # numeric format: YYYYMMDD or "latest" - e.g. D20050224 or Dlatest + #G newsGroup (or similar entity - e.g. a web forum name) + 'hostname': 'H', + 'keyword': 'K', + 'lang': 'L', # ISO Language code + #M Month (numeric format: YYYYMM) + #N ISO couNtry code (or domaiN name) + #P Pathname + #Q uniQue id + 'raw': 'R', # Raw (i.e. unstemmed) term + 'title': 'S', # Subject (or title) + 'mimetype': 'T', + 'url': 'U', # full URL of indexed document - if the resulting term would be > 240 + # characters, a hashing scheme is used to prevent overflowing + # the Xapian term length limit (see omindex for how to do this). + #W "weak" (approximately 10 day intervals, taken as YYYYMMD from + # the D term, and changing the last digit to a '2' if it's a '3') + #X longer prefix for user-defined use + 'linkto': 'XLINKTO', # this document links to that document + 'stem_lang': 'XSTEMLANG', # ISO Language code this document was stemmed in + #Y year (four digits) + } + + class LockedException(Exception): + pass + + def __init__(self, request): + self.request = request + cache_dir = request.cfg.cache_dir + main_dir = self._main_dir() + self.dir = os.path.join(main_dir, 'index') + filesys.makeDirs(self.dir) + self.sig_file = os.path.join(main_dir, 'complete') + lock_dir = os.path.join(main_dir, 'index-lock') + self.lock = lock.WriteLock(lock_dir, + timeout=3600.0, readlocktimeout=60.0) + self.read_lock = lock.ReadLock(lock_dir, timeout=3600.0) + self.queue = UpdateQueue(os.path.join(main_dir, 'update-queue'), + os.path.join(main_dir, 'update-queue-lock')) + + # Disabled until we have a sane way to build the index with a + # queue in small steps. 
+ ## if not self.exists(): + ## self.indexPagesInNewThread(request) + + # Check if we should and can stem words + if request.cfg.xapian_stemming and not Stemmer: + request.cfg.xapian_stemming = False + + def _main_dir(self): + if self.request.cfg.xapian_index_dir: + return os.path.join(self.request.cfg.xapian_index_dir, + self.request.cfg.siteid) + else: + return os.path.join(self.request.cfg.cache_dir, 'xapian') + + def exists(self): + """ Check if index exists """ + return os.path.exists(self.sig_file) + + def mtime(self): + return os.path.getmtime(self.dir) + + def _search(self, query): + """ read lock must be acquired """ + while True: + try: + searcher, timestamp = self.request.cfg.xapian_searchers.pop() + if timestamp != self.mtime(): + searcher.close() + else: + break + except IndexError: + searcher = xapidx.ReadOnlyIndex(self.dir) + searcher.configure(self.prefixMap, self.indexValueMap) + timestamp = self.mtime() + break + + hits = searcher.search(query, valuesWanted=['pagename', 'attachment', 'mtime', 'wikiname']) + self.request.cfg.xapian_searchers.append((searcher, timestamp)) + return hits + + def search(self, query): + if not self.read_lock.acquire(1.0): + raise self.LockedException + try: + hits = self._search(query) + finally: + self.read_lock.release() + return hits + + def update_page(self, page): + self.queue.append(page.page_name) + self._do_queued_updates_InNewThread() + + def indexPages(self, files=None, mode='update'): + """ Index all pages (and files, if given) + + Can be called only from a script. To index pages during a user + request, use indexPagesInNewThread. 
+ @arg files: iterator or list of files to index additionally + """ + if not self.lock.acquire(1.0): + self.request.log("can't index: can't acquire lock") + return + try: + request = self._indexingRequest(self.request) + self._index_pages(request, None, files, mode) + finally: + self.lock.release() + + def indexPagesInNewThread(self, files=None, mode='update'): + """ Index all pages in a new thread + + Should be called from a user request. From a script, use indexPages. + """ + if not self.lock.acquire(1.0): + self.request.log("can't index: can't acquire lock") + return + try: + # Prevent rebuilding the index just after it was finished + if self.exists(): + self.lock.release() + return + from threading import Thread + indexThread = Thread(target=self._index_pages, + args=(self._indexingRequest(self.request), self.lock, files, mode)) + indexThread.setDaemon(True) + + # Join the index thread after current request finish, prevent + # Apache CGI from killing the process. + def joinDecorator(finish): + def func(): + finish() + indexThread.join() + return func + + self.request.finish = joinDecorator(self.request.finish) + indexThread.start() + except: + self.lock.release() + raise + + def optimize(self): + pass + + # Private ---------------------------------------------------------------- + + def _do_queued_updates_InNewThread(self): + """ do queued index updates in a new thread + + Should be called from a user request. From a script, use indexPages. + """ + if not self.lock.acquire(1.0): + self.request.log("can't index: can't acquire lock") + return + try: + from threading import Thread + indexThread = Thread(target=self._do_queued_updates, + args=(self._indexingRequest(self.request), self.lock)) + indexThread.setDaemon(True) + + # Join the index thread after current request finish, prevent + # Apache CGI from killing the process. 
+ def joinDecorator(finish): + def func(): + finish() + indexThread.join() + return func + + self.request.finish = joinDecorator(self.request.finish) + indexThread.start() + except: + self.lock.release() + raise + + def _do_queued_updates(self, request, lock=None, amount=5): + """ Assumes that the write lock is acquired """ + try: + writer = xapidx.Index(self.dir, True) + writer.configure(self.prefixMap, self.indexValueMap) + pages = self.queue.pages()[:amount] + for name in pages: + p = Page(request, name) + self._index_page(writer, p, mode='update') + self.queue.remove([name]) + finally: + writer.close() + if lock: + lock.release() + + def contentfilter(self, filename): + """ Get a filter for content of filename and return unicode content. """ + request = self.request + mt = wikiutil.MimeType(filename=filename) + for modulename in mt.module_name(): + try: + execute = wikiutil.importPlugin(request.cfg, 'filter', modulename) + break + except wikiutil.PluginMissingError: + pass + else: + request.log("Cannot load filter for mimetype." 
+ modulename) + try: + data = execute(self, filename) + if debug: + request.log("Filter %s returned %d characters for file %s" % (modulename, len(data), filename)) + except (OSError, IOError), err: + data = '' + request.log("Filter %s threw error '%s' for file %s" % (modulename, str(err), filename)) + return mt.mime_type(), data + + def test(self, request): + idx = xapidx.ReadOnlyIndex(self.dir) + idx.configure(self.prefixMap, self.indexValueMap) + print idx.search("is") + #for d in docs: + # request.log("%r %r %r" % (d, d.get('attachment'), d.get('pagename'))) + + def _index_file(self, request, writer, filename, mode='update'): + """ index a file as it were a page named pagename + Assumes that the write lock is acquired + """ + fs_rootpage = 'FS' # XXX FS hardcoded + try: + wikiname = request.cfg.interwikiname or 'Self' + itemid = "%s:%s" % (wikiname, os.path.join(fs_rootpage, filename)) + mtime = os.path.getmtime(filename) + mtime = wikiutil.timestamp2version(mtime) + if mode == 'update': + query = xapidx.RawQuery(xapdoc.makePairForWrite('itemid', itemid)) + docs = writer.search(query, valuesWanted=['pagename', 'attachment', 'mtime', 'wikiname', ]) + if docs: + doc = docs[0] # there should be only one + uid = doc['uid'] + docmtime = long(doc['values']['mtime']) + updated = mtime > docmtime + if debug: request.log("uid %r: mtime %r > docmtime %r == updated %r" % (uid, mtime, docmtime, updated)) + else: + uid = None + updated = True + elif mode == 'add': + updated = True + if debug: request.log("%s %r" % (filename, updated)) + if updated: + xitemid = xapdoc.Keyword('itemid', itemid) + mimetype, file_content = self.contentfilter(filename) + xwname = xapdoc.SortKey('wikiname', request.cfg.interwikiname or "Self") + xpname = xapdoc.SortKey('pagename', fs_rootpage) + xattachment = xapdoc.SortKey('attachment', filename) # XXX we should treat files like real pages, not attachments + xmtime = xapdoc.SortKey('mtime', mtime) + title = " ".join(os.path.join(fs_rootpage, 
filename).split("/")) + xtitle = xapdoc.Keyword('title', title) + xmimetype = xapdoc.TextField('mimetype', mimetype, True) + xcontent = xapdoc.TextField('content', file_content) + doc = xapdoc.Document(textFields=(xcontent, xmimetype, ), + keywords=(xtitle, xitemid, ), + sortFields=(xpname, xattachment, xmtime, xwname, ), + ) + doc.analyzerFactory = getWikiAnalyzerFactory() + if mode == 'update': + if debug: request.log("%s (replace %r)" % (filename, uid)) + doc.uid = uid + id = writer.index(doc) + elif mode == 'add': + if debug: request.log("%s (add)" % (filename,)) + id = writer.index(doc) + except (OSError, IOError), err: + pass + + def _get_languages(self, page): + body = page.get_raw_body() + default_lang = page.request.cfg.language_default + + lang = '' + + if page.request.cfg.xapian_stemming: + for line in body.split('\n'): + if line.startswith('#language'): + lang = line.split(' ')[1] + try: + Stemmer(lang) + except KeyError: + # lang is not stemmable + break + else: + # lang is stemmable + return (lang, lang) + elif not line.startswith('#'): + break + + if not lang: + # no lang found at all.. 
fallback to default language + lang = default_lang + + # return actual lang and lang to stem in + return (lang, default_lang) + + def _index_page(self, writer, page, mode='update'): + """ Index a page - assumes that the write lock is acquired + @arg writer: the index writer object + @arg page: a page object + @arg mode: 'add' = just add, no checks + 'update' = check if already in index and update if needed (mtime) + + """ + request = page.request + wikiname = request.cfg.interwikiname or "Self" + pagename = page.page_name + mtime = page.mtime_usecs() + itemid = "%s:%s" % (wikiname, pagename) + # XXX: Hack until we get proper metadata + language, stem_language = self._get_languages(page) + updated = False + + if mode == 'update': + # from #xapian: if you generate a special "unique id" term, + # you can just call database.replace_document(uid_term, doc) + # -> done in xapwrap.index.Index.index() + query = xapidx.RawQuery(xapdoc.makePairForWrite('itemid', itemid)) + docs = writer.search(query, valuesWanted=['pagename', 'attachment', 'mtime', 'wikiname', ]) + if docs: + doc = docs[0] # there should be only one + uid = doc['uid'] + docmtime = long(doc['values']['mtime']) + updated = mtime > docmtime + if debug: request.log("uid %r: mtime %r > docmtime %r == updated %r" % (uid, mtime, docmtime, updated)) + else: + uid = None + updated = True + elif mode == 'add': + updated = True + if debug: request.log("%s %r" % (pagename, updated)) + if updated: + xwname = xapdoc.SortKey('wikiname', request.cfg.interwikiname or "Self") + xpname = xapdoc.SortKey('pagename', pagename) + xattachment = xapdoc.SortKey('attachment', '') # this is a real page, not an attachment + xmtime = xapdoc.SortKey('mtime', mtime) + xtitle = xapdoc.TextField('title', pagename, True) # prefixed + xkeywords = [xapdoc.Keyword('itemid', itemid), + xapdoc.Keyword('lang', language), + xapdoc.Keyword('stem_lang', stem_language)] + for pagelink in page.getPageLinks(request): + 
xkeywords.append(xapdoc.Keyword('linkto', pagelink)) + xcontent = xapdoc.TextField('content', page.get_raw_body()) + doc = xapdoc.Document(textFields=(xcontent, xtitle), + keywords=xkeywords, + sortFields=(xpname, xattachment, xmtime, xwname, ), + ) + doc.analyzerFactory = getWikiAnalyzerFactory(request, + stem_language) + + if mode == 'update': + if debug: request.log("%s (replace %r)" % (pagename, uid)) + doc.uid = uid + id = writer.index(doc) + elif mode == 'add': + if debug: request.log("%s (add)" % (pagename,)) + id = writer.index(doc) + + from MoinMoin.action import AttachFile + + attachments = AttachFile._get_files(request, pagename) + for att in attachments: + filename = AttachFile.getFilename(request, pagename, att) + att_itemid = "%s//%s" % (itemid, att) + mtime = wikiutil.timestamp2version(os.path.getmtime(filename)) + if mode == 'update': + query = xapidx.RawQuery(xapdoc.makePairForWrite('itemid', att_itemid)) + docs = writer.search(query, valuesWanted=['pagename', 'attachment', 'mtime', ]) + if debug: request.log("##%r %r" % (filename, docs)) + if docs: + doc = docs[0] # there should be only one + uid = doc['uid'] + docmtime = long(doc['values']['mtime']) + updated = mtime > docmtime + if debug: request.log("uid %r: mtime %r > docmtime %r == updated %r" % (uid, mtime, docmtime, updated)) + else: + uid = None + updated = True + elif mode == 'add': + updated = True + if debug: request.log("%s %s %r" % (pagename, att, updated)) + if updated: + xatt_itemid = xapdoc.Keyword('itemid', att_itemid) + xpname = xapdoc.SortKey('pagename', pagename) + xattachment = xapdoc.SortKey('attachment', att) # this is an attachment, store its filename + xmtime = xapdoc.SortKey('mtime', mtime) + xtitle = xapdoc.Keyword('title', '%s/%s' % (pagename, att)) + xlanguage = xapdoc.Keyword('lang', language) + xstem_language = xapdoc.Keyword('stem_lang', stem_language) + mimetype, att_content = self.contentfilter(filename) + xmimetype = xapdoc.TextField('mimetype', mimetype, True) + 
xcontent = xapdoc.TextField('content', att_content) + doc = xapdoc.Document(textFields=(xcontent, xmimetype, ), + keywords=(xatt_itemid, xtitle, xlanguage, xstem_language, ), + sortFields=(xpname, xattachment, xmtime, xwname, ), + ) + doc.analyzerFactory = getWikiAnalyzerFactory(request, + stem_language) + if mode == 'update': + if debug: request.log("%s (replace %r)" % (pagename, uid)) + doc.uid = uid + id = writer.index(doc) + elif mode == 'add': + if debug: request.log("%s (add)" % (pagename,)) + id = writer.index(doc) + #writer.flush() + + + def _index_pages(self, request, lock=None, files=None, mode='update'): + """ Index all pages (and all given files) + + This should be called from indexPages or indexPagesInNewThread only! + + This may take some time, depending on the size of the wiki and speed + of the machine. + + When called in a new thread, lock is acquired before the call, + and this method must release it when it finishes or fails. + """ + try: + self._unsign() + start = time.time() + writer = xapidx.Index(self.dir, True) + writer.configure(self.prefixMap, self.indexValueMap) + pages = request.rootpage.getPageList(user='', exists=1) + request.log("indexing all (%d) pages..." % len(pages)) + for pagename in pages: + p = Page(request, pagename) + self._index_page(writer, p, mode) + if files: + request.log("indexing all files...") + for fname in files: + fname = fname.strip() + self._index_file(request, writer, fname, mode) + writer.close() + request.log("indexing completed successfully in %0.2f seconds." % + (time.time() - start)) + self._sign() + finally: + writer.__del__() + if lock: + lock.release() + + def _optimize(self, request): + """ Optimize the index """ + pass + + def _indexingRequest(self, request): + """ Return a new request that can be used for index building. + + This request uses a security policy that lets the current user + read any page. Without this policy some pages will not render, + which will create broken pagelinks index. 
+ """ + from MoinMoin.request.CLI import Request + from MoinMoin.security import Permissions + request = Request(request.url) + class SecurityPolicy(Permissions): + def read(*args, **kw): + return True + request.user.may = SecurityPolicy(request.user) + return request + + def _unsign(self): + """ Remove sig file - assume write lock acquired """ + try: + os.remove(self.sig_file) + except OSError, err: + if err.errno != errno.ENOENT: + raise + + def _sign(self): + """ Add sig file - assume write lock acquired """ + f = file(self.sig_file, 'w') + try: + f.write('') + finally: + f.close() + + +def run_query(query, db): + enquire = xapian.Enquire(db) + parser = xapian.QueryParser() + query = parser.parse_query(query, xapian.QueryParser.FLAG_WILDCARD) + print query.get_description() + enquire.set_query(query) + return enquire.get_mset(0, 10) + +def run(request): + pass + #print "Begin" + #db = xapian.WritableDatabase(xapian.open('test.db', + # xapian.DB_CREATE_OR_OPEN)) + # + # index_data(db) ??? + #del db + #mset = run_query(sys.argv[1], db) + #print mset.get_matches_estimated() + #iterator = mset.begin() + #while iterator != mset.end(): + # print iterator.get_document().get_data() + # iterator.next() + #for i in xrange(1,170): + # doc = db.get_document(i) + # print doc.get_data() + +if __name__ == '__main__': + run() + + diff -r d9bd5d6ae30d -r 5469c8b911a4 MoinMoin/search/__init__.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MoinMoin/search/__init__.py Tue Jun 27 15:09:46 2006 +0200 @@ -0,0 +1,27 @@ +# -*- coding: iso-8859-1 -*- +""" + MoinMoin - search engine + + @copyright: 2005 MoinMoin:FlorianFesti, + 2005 MoinMoin:NirSoffer, + 2005 MoinMoin:AlexanderSchremmer, + 2006 MoinMoin:ThomasWaldmann, + 2006 MoinMoin:FranzPletz + @license: GNU GPL, see COPYING for details +""" + +from MoinMoin.search.queryparser import QueryParser +from MoinMoin.search.builtin import Search + +def searchPages(request, query, **kw): + """ Search the text of all pages for query. 
+ + @param request: current request + @param query: the expression (string or query objects) we want to search for + @rtype: SearchResults instance + @return: search results + """ + if isinstance(query, str) or isinstance(query, unicode): + query = QueryParser(**kw).parse_query(query) + return Search(request, query).run() + diff -r d9bd5d6ae30d -r 5469c8b911a4 MoinMoin/search/builtin.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MoinMoin/search/builtin.py Tue Jun 27 15:09:46 2006 +0200 @@ -0,0 +1,159 @@ +# -*- coding: iso-8859-1 -*- +""" + MoinMoin - search engine + + @copyright: 2005 MoinMoin:FlorianFesti, + 2005 MoinMoin:NirSoffer, + 2005 MoinMoin:AlexanderSchremmer, + 2006 MoinMoin:ThomasWaldmann, + 2006 MoinMoin:FranzPletz + @license: GNU GPL, see COPYING for details +""" + +import time, sys +from MoinMoin import wikiutil, config +from MoinMoin.Page import Page +from MoinMoin.search.results import FoundRemote, FoundPage, FoundAttachment, SearchResults + +try: + from MoinMoin.search import Xapian +except ImportError: + pass + + +############################################################################## +### Searching +############################################################################## + +class Search: + """ A search run """ + + def __init__(self, request, query): + self.request = request + self.query = query + self.filtered = False + self.fs_rootpage = "FS" # XXX FS hardcoded + + def run(self): + """ Perform search and return results object """ + start = time.time() + if self.request.cfg.xapian_search: + hits = self._xapianSearch() + else: + hits = self._moinSearch() + + # important - filter deleted pages or pages the user may not read! 
+ if not self.filtered: + hits = self._filter(hits) + + result_hits = [] + for wikiname, page, attachment, match in hits: + if wikiname in (self.request.cfg.interwikiname, 'Self'): # a local match + if attachment: + result_hits.append(FoundAttachment(page.page_name, attachment)) + else: + result_hits.append(FoundPage(page.page_name, match)) + else: + result_hits.append(FoundRemote(wikiname, page, attachment, match)) + elapsed = time.time() - start + count = self.request.rootpage.getPageCount() + return SearchResults(self.query, result_hits, count, elapsed) + + # ---------------------------------------------------------------- + # Private! + + def _xapianSearch(self): + """ Search using Xapian + + Get a list of pages using fast xapian search and + return moin search in those pages. + """ + pages = None + try: + index = Xapian.Index(self.request) + except NameError: + index = None + if index and index.exists() and self.query.xapian_wanted(): + self.request.clock.start('_xapianSearch') + try: + from MoinMoin.support import xapwrap + query = self.query.xapian_term(self.request) + self.request.log("xapianSearch: query = %r" % + query.get_description()) + query = xapwrap.index.QObjQuery(query) + hits = index.search(query) + self.request.log("xapianSearch: finds: %r" % hits) + def dict_decode(d): + """ decode dict values to unicode """ + for k, v in d.items(): + d[k] = d[k].decode(config.charset) + return d + pages = [dict_decode(hit['values']) for hit in hits] + self.request.log("xapianSearch: finds pages: %r" % pages) + except index.LockedException: + pass + self.request.clock.stop('_xapianSearch') + return self._moinSearch(pages) + + def _moinSearch(self, pages=None): + """ Search pages using moin's built-in full text search + + Return list of tuples (page, match). The list may contain + deleted pages or pages the user may not read. 
+ """ + self.request.clock.start('_moinSearch') + from MoinMoin.Page import Page + if pages is None: + # if we are not called from _xapianSearch, we make a full pagelist, + # but don't search attachments (thus attachment name = '') + pages = [{'pagename': p, 'attachment': '', 'wikiname': 'Self', } for p in self._getPageList()] + hits = [] + fs_rootpage = self.fs_rootpage + for valuedict in pages: + wikiname = valuedict['wikiname'] + pagename = valuedict['pagename'] + attachment = valuedict['attachment'] + if wikiname in (self.request.cfg.interwikiname, 'Self'): # THIS wiki + page = Page(self.request, pagename) + if attachment: + if pagename == fs_rootpage: # not really an attachment + page = Page(self.request, "%s/%s" % (fs_rootpage, attachment)) + hits.append((wikiname, page, None, None)) + else: + hits.append((wikiname, page, attachment, None)) + else: + match = self.query.search(page) + if match: + hits.append((wikiname, page, attachment, match)) + else: # other wiki + hits.append((wikiname, pagename, attachment, None)) + self.request.clock.stop('_moinSearch') + return hits + + def _getPageList(self): + """ Get list of pages to search in + + If the query has a page filter, use it to filter pages before + searching. If not, get a unfiltered page list. The filtering + will happen later on the hits, which is faster with current + slow storage. + """ + filter = self.query.pageFilter() + if filter: + # There is no need to filter the results again. 
+ self.filtered = True + return self.request.rootpage.getPageList(filter=filter) + else: + return self.request.rootpage.getPageList(user='', exists=0) + + def _filter(self, hits): + """ Filter out deleted or acl protected pages """ + userMayRead = self.request.user.may.read + fs_rootpage = self.fs_rootpage + "/" + thiswiki = (self.request.cfg.interwikiname, 'Self') + filtered = [(wikiname, page, attachment, match) for wikiname, page, attachment, match in hits + if not wikiname in thiswiki or + page.exists() and userMayRead(page.page_name) or + page.page_name.startswith(fs_rootpage)] + return filtered + diff -r d9bd5d6ae30d -r 5469c8b911a4 MoinMoin/search/queryparser.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MoinMoin/search/queryparser.py Tue Jun 27 15:09:46 2006 +0200 @@ -0,0 +1,695 @@ +# -*- coding: iso-8859-1 -*- +""" + MoinMoin - search engine query parser + + @copyright: 2005 MoinMoin:FlorianFesti, + 2005 MoinMoin:NirSoffer, + 2005 MoinMoin:AlexanderSchremmer, + 2006 MoinMoin:ThomasWaldmann, + 2006 MoinMoin:FranzPletz + @license: GNU GPL, see COPYING for details +""" + +import re, string +from MoinMoin import config +from MoinMoin.search.results import Match, TitleMatch, TextMatch + +try: + from MoinMoin.search import Xapian + from MoinMoin.search.Xapian import Query, UnicodeQuery +except ImportError: + pass + +############################################################################# +### query objects +############################################################################# + +class BaseExpression: + """ Base class for all search terms """ + + def __init__(self): + self.negated = 0 + + def __str__(self): + return unicode(self).encode(config.charset, 'replace') + + def negate(self): + """ Negate the result of this term """ + self.negated = 1 + + def pageFilter(self): + """ Return a page filtering function + + This function is used to filter page list before we search + it. Return a function that get a page name, and return bool. 
+ + The default expression does not have any filter function and + return None. Sub class may define custom filter functions. + """ + return None + + def search(self, page): + """ Search a page + + Returns a list of Match objects or None if term didn't find + anything (vice versa if negate() was called). Terms containing + other terms must call this method to aggregate the results. + This Base class returns True (Match()) if not negated. + """ + if self.negated: + # XXX why? + return [Match()] + else: + return None + + def costs(self): + """ Return estimated time to calculate this term + + Number is relative to other terms and has no real unit. + It allows to do the fast searches first. + """ + return 0 + + def highlight_re(self): + """ Return a regular expression of what the term searches for + + Used to display the needle in the page. + """ + return '' + + def _build_re(self, pattern, use_re=False, case=False, stemmed=False): + """ Make a regular expression out of a text pattern """ + flags = case and re.U or (re.I | re.U) + if use_re: + try: + self.search_re = re.compile(pattern, flags) + except re.error: + pattern = re.escape(pattern) + self.pattern = pattern + self.search_re = re.compile(pattern, flags) + else: + self.pattern = pattern + else: + pattern = re.escape(pattern) + self.search_re = re.compile(pattern, flags) + self.pattern = pattern + + +class AndExpression(BaseExpression): + """ A term connecting several sub terms with a logical AND """ + + operator = ' ' + + def __init__(self, *terms): + self._subterms = list(terms) + self._costs = 0 + for t in self._subterms: + self._costs += t.costs() + self.negated = 0 + + def append(self, expression): + """ Append another term """ + self._subterms.append(expression) + self._costs += expression.costs() + + def subterms(self): + return self._subterms + + def costs(self): + return self._costs + + def __unicode__(self): + result = '' + for t in self._subterms: + result += self.operator + t + return u'[' + 
result[len(self.operator):] + u']' + + def pageFilter(self): + """ Return a page filtering function + + This function is used to filter page list before we search it. + + Return a function that gets a page name, and return bool, or None. + """ + # Sort terms by cost, then get all title searches + self.sortByCost() + terms = [term for term in self._subterms if isinstance(term, TitleSearch)] + if terms: + # Create and return a filter function + def filter(name): + """ A function that return True if all terms filter name """ + for term in terms: + filter = term.pageFilter() + if not filter(name): + return False + return True + return filter + + return None + + def sortByCost(self): + tmp = [(term.costs(), term) for term in self._subterms] + tmp.sort() + self._subterms = [item[1] for item in tmp] + + def search(self, page): + """ Search for each term, cheap searches first """ + self.sortByCost() + matches = [] + for term in self._subterms: + result = term.search(page) + if not result: + return None + matches.extend(result) + return matches + + def highlight_re(self): + result = [] + for s in self._subterms: + highlight_re = s.highlight_re() + if highlight_re: result.append(highlight_re) + + return '|'.join(result) + + def xapian_wanted(self): + wanted = True + for term in self._subterms: + wanted = wanted and term.xapian_wanted() + return wanted + + def xapian_term(self, request): + # sort negated terms + terms = [] + not_terms = [] + for term in self._subterms: + if not term.negated: + terms.append(term.xapian_term(request)) + else: + not_terms.append(term.xapian_term(request)) + + # prepare query for not negated terms + if len(terms) == 1: + t1 = Query(terms[0]) + else: + t1 = Query(Query.OP_AND, terms) + + # negated terms? 
+ if not not_terms: + # no, just return query for not negated terms + return t1 + + # yes, link not negated and negated terms' query with a AND_NOT query + if len(not_terms) == 1: + t2 = Query(not_terms[0]) + else: + t2 = Query(Query.OP_OR, not_terms) + + return Query(Query.OP_AND_NOT, t1, t2) + + +class OrExpression(AndExpression): + """ A term connecting several sub terms with a logical OR """ + + operator = ' or ' + + def search(self, page): + """ Search page with terms, cheap terms first + + XXX Do we have any reason to sort here? we are not breaking out + of the search in any case. + """ + self.sortByCost() + matches = [] + for term in self._subterms: + result = term.search(page) + if result: + matches.extend(result) + return matches + + def xapian_term(self, request): + # XXX: negated terms managed by _moinSearch? + return Query(Query.OP_OR, [term.xapian_term(request) for term in self._subterms]) + + +class TextSearch(BaseExpression): + """ A term that does a normal text search + + Both page content and the page title are searched, using an + additional TitleSearch term. 
+ """ + + def __init__(self, pattern, use_re=False, case=False): + """ Init a text search + + @param pattern: pattern to search for, ascii string or unicode + @param use_re: treat pattern as re of plain text, bool + @param case: do case sensitive search, bool + """ + self._pattern = unicode(pattern) + self.negated = 0 + self.use_re = use_re + self.case = case + self._build_re(self._pattern, use_re=use_re, case=case) + self.titlesearch = TitleSearch(self._pattern, use_re=use_re, case=case) + + def costs(self): + return 10000 + + def __unicode__(self): + neg = self.negated and '-' or '' + return u'%s"%s"' % (neg, unicode(self._pattern)) + + def highlight_re(self): + return u"(%s)" % self._pattern + + def search(self, page): + matches = [] + + # Search in page name + results = self.titlesearch.search(page) + if results: + matches.extend(results) + + # Search in page body + body = page.get_raw_body() + for match in self.search_re.finditer(body): + if page.request.cfg.xapian_stemming: + # somewhere in regular word + if body[match.start()] not in config.chars_upper and \ + body[match.start()-1] in config.chars_lower: + continue + + post = 0 + for c in body[match.end():]: + if c in config.chars_lower: + post += 1 + else: + break + + matches.append(TextMatch(start=match.start(), + end=match.end()+post)) + else: + matches.append(TextMatch(re_match=match)) + + # Decide what to do with the results. 
+ if ((self.negated and matches) or + (not self.negated and not matches)): + return None + elif matches: + return matches + else: + return [] + + def xapian_wanted(self): + return not self.use_re + + def xapian_term(self, request): + if self.use_re: + return None # xapian can't do regex search + else: + analyzer = Xapian.WikiAnalyzer(request=request, + language=request.cfg.language_default) + terms = self._pattern.split() + + # all parsed wikiwords, AND'ed + queries = [] + stemmed = [] + for t in terms: + if request.cfg.xapian_stemming: + # stemmed OR not stemmed + tmp = [] + for i in analyzer.tokenize(t, flat_stemming=False): + tmp.append(UnicodeQuery(Query.OP_OR, i)) + stemmed.append(i[1]) + t = tmp + else: + # just not stemmed + t = [UnicodeQuery(i) for i in analyzer.tokenize(t)] + queries.append(Query(Query.OP_AND, t)) + + if stemmed: + self._build_re(' '.join(stemmed), use_re=False, + case=self.case, stemmed=True) + + # titlesearch OR parsed wikiwords + return Query(Query.OP_OR, + (self.titlesearch.xapian_term(request), + Query(Query.OP_AND, queries))) + + +class TitleSearch(BaseExpression): + """ Term searches in pattern in page title only """ + + def __init__(self, pattern, use_re=False, case=False): + """ Init a title search + + @param pattern: pattern to search for, ascii string or unicode + @param use_re: treat pattern as re of plain text, bool + @param case: do case sensitive search, bool + """ + self._pattern = unicode(pattern) + self.negated = 0 + self.use_re = use_re + self.case = case + self._build_re(self._pattern, use_re=use_re, case=case) + + def costs(self): + return 100 + + def __unicode__(self): + neg = self.negated and '-' or '' + return u'%s!"%s"' % (neg, unicode(self._pattern)) + + def highlight_re(self): + return u"(%s)" % self._pattern + + def pageFilter(self): + """ Page filter function for single title search """ + def filter(name): + match = self.search_re.search(name) + if ((self.negated and match) or + (not self.negated and not 
match)): + return False + return True + return filter + + def search(self, page): + # Get matches in page name + matches = [] + for match in self.search_re.finditer(page.page_name): + if page.request.cfg.xapian_stemming: + # somewhere in regular word + if page.page_name[match.start()] not in config.chars_upper and \ + page.page_name[match.start()-1] in config.chars_lower: + continue + + post = 0 + for c in page.page_name[match.end():]: + if c in config.chars_lower: + post += 1 + else: + break + + matches.append(TitleMatch(start=match.start(), + end=match.end()+post)) + else: + matches.append(TitleMatch(re_match=match)) + + if ((self.negated and matches) or + (not self.negated and not matches)): + return None + elif matches: + return matches + else: + return [] + + def xapian_wanted(self): + return not self.use_re + + def xapian_term(self, request): + if self.use_re: + return None # xapian doesn't support regex search + else: + analyzer = Xapian.WikiAnalyzer(request=request, + language=request.cfg.language_default) + terms = self._pattern.split() + terms = [list(analyzer.raw_tokenize(t)) for t in terms] + + # all parsed wikiwords, AND'ed + queries = [] + stemmed = [] + for t in terms: + if request.cfg.xapian_stemming: + # stemmed OR not stemmed + tmp = [] + for i in analyzer.tokenize(t, flat_stemming=False): + tmp.append(UnicodeQuery(Query.OP_OR, ['%s%s' % + (Xapian.Index.prefixMap['title'], j) for j in i])) + stemmed.append(i[1]) + t = tmp + else: + # just not stemmed + t = [UnicodeQuery('%s%s' % (Xapian.Index.prefixMap['title'], i)) + for i in analyzer.tokenize(t)] + + queries.append(Query(Query.OP_AND, t)) + + if stemmed: + self._build_re(' '.join(stemmed), use_re=False, + case=self.case, stemmed=True) + + return Query(Query.OP_AND, queries) + + +class LinkSearch(BaseExpression): + """ Search the term in the pagelinks """ + + def __init__(self, pattern, use_re=False, case=True): + """ Init a link search + + @param pattern: pattern to search for, ascii string or 
unicode + @param use_re: treat pattern as re of plain text, bool + @param case: do case sensitive search, bool + """ + # used for search in links + self._pattern = pattern + # used for search in text + self._textpattern = '(' + self._pattern.replace('/', '|') + ')' + self.negated = 0 + self.use_re = use_re + self.case = case + self.textsearch = TextSearch(self._textpattern, use_re=1, case=case) + self._build_re(unicode(pattern), use_re=use_re, case=case) + + def _build_re(self, pattern, use_re=False, case=False): + """ Make a regular expression out of a text pattern """ + flags = case and re.U or (re.I | re.U) + try: + if not use_re: + raise re.error + self.search_re = re.compile(pattern, flags) + self.static = False + except re.error: + self.pattern = pattern + self.static = True + + def costs(self): + return 5000 # cheaper than a TextSearch + + def __unicode__(self): + neg = self.negated and '-' or '' + return u'%s!"%s"' % (neg, unicode(self._pattern)) + + def highlight_re(self): + return u"(%s)" % self._textpattern + + def search(self, page): + # Get matches in page name + matches = [] + + Found = True + + for link in page.getPageLinks(page.request): + if ((self.static and self.pattern == link) or + (not self.static and self.search_re.match(link))): + break + else: + Found = False + + if Found: + # Search in page text + results = self.textsearch.search(page) + if results: + matches.extend(results) + else: #This happens e.g. for pages that use navigation macros + matches.append(TextMatch(0, 0)) + + # Decide what to do with the results. 
+ if ((self.negated and matches) or + (not self.negated and not matches)): + return None + elif matches: + return matches + else: + return [] + + def xapian_wanted(self): + return not self.use_re + + def xapian_term(self, request): + pattern = self.pattern + if self.use_re: + return None # xapian doesnt support regex search + else: + return UnicodeQuery('%s:%s' % + (Xapian.Index.prefixMap['linkto'], pattern)) + + +class LanguageSearch(BaseExpression): + """ Search the pages written in a language """ + + def __init__(self, pattern, use_re=False, case=True): + """ Init a language search + + @param pattern: pattern to search for, ascii string or unicode + @param use_re: treat pattern as re of plain text, bool + @param case: do case sensitive search, bool + """ + # iso language code, always lowercase + self._pattern = pattern.lower() + self.negated = 0 + self.use_re = use_re + self.case = case + self.xapian_called = False + self._build_re(self._pattern, use_re=use_re, case=case) + + def costs(self): + return 5000 # cheaper than a TextSearch + + def __unicode__(self): + neg = self.negated and '-' or '' + return u'%s!"%s"' % (neg, unicode(self._pattern)) + + def highlight_re(self): + return "" + + def search(self, page): + # We just use (and trust ;)) xapian for this.. 
deactivated for _moinSearch + if not self.xapian_called: + return [] + else: + return [Match()] + + def xapian_wanted(self): + return not self.use_re + + def xapian_term(self, request): + pattern = self.pattern + if self.use_re: + return None # xapian doesnt support regex search + else: + self.xapian_called = True + return UnicodeQuery('%s%s' % + (Xapian.Index.prefixMap['lang'], pattern)) + + +############################################################################## +### Parse Query +############################################################################## + +class QueryParser: + """ + Converts a String into a tree of Query objects + using recursive top/down parsing + """ + + def __init__(self, **kw): + """ + @keyword titlesearch: treat all terms as title searches + @keyword case: do case sensitive search + @keyword regex: treat all terms as regular expressions + """ + self.titlesearch = kw.get('titlesearch', 0) + self.case = kw.get('case', 0) + self.regex = kw.get('regex', 0) + + def parse_query(self, query): + """ transform an string into a tree of Query objects """ + if isinstance(query, str): + query = query.decode(config.charset) + self._query = query + result = self._or_expression() + if result is None: + result = BaseExpression() + return result + + def _or_expression(self): + result = self._and_expression() + if self._query: + result = OrExpression(result) + while self._query: + q = self._and_expression() + if q: + result.append(q) + return result + + def _and_expression(self): + result = None + while not result and self._query: + result = self._single_term() + term = self._single_term() + if term: + result = AndExpression(result, term) + else: + return result + term = self._single_term() + while term: + result.append(term) + term = self._single_term() + return result + + def _single_term(self): + regex = (r'(?P-?)\s*(' + # leading '-' + r'(?P\(|\)|(or\b(?!$)))|' + # or, (, ) + r'(?P(\w+:)*)' + + r'(?P("[^"]+")|' + + r"('[^']+')|(\S+)))") # search 
word itself + self._query = self._query.strip() + match = re.match(regex, self._query, re.U) + if not match: + return None + self._query = self._query[match.end():] + ops = match.group("OPS") + if ops == '(': + result = self._or_expression() + if match.group("NEG"): result.negate() + return result + elif ops == ')': + return None + elif ops == 'or': + return None + modifiers = match.group('MOD').split(":")[:-1] + text = match.group('TERM') + if self.isQuoted(text): + text = text[1:-1] + + title_search = self.titlesearch + regex = self.regex + case = self.case + linkto = False + lang = False + + for m in modifiers: + if "title".startswith(m): + title_search = True + elif "regex".startswith(m): + regex = True + elif "case".startswith(m): + case = True + elif "linkto".startswith(m): + linkto = True + elif "language".startswith(m): + lang = True + + if lang: + obj = LanguageSearch(text, use_re=regex, case=False) + elif linkto: + obj = LinkSearch(text, use_re=regex, case=case) + elif title_search: + obj = TitleSearch(text, use_re=regex, case=case) + else: + obj = TextSearch(text, use_re=regex, case=case) + + if match.group("NEG"): + obj.negate() + return obj + + def isQuoted(self, text): + # Empty string '' is not considered quoted + if len(text) < 3: + return False + return (text.startswith('"') and text.endswith('"') or + text.startswith("'") and text.endswith("'")) + + + diff -r d9bd5d6ae30d -r 5469c8b911a4 MoinMoin/search/results.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MoinMoin/search/results.py Tue Jun 27 15:09:46 2006 +0200 @@ -0,0 +1,642 @@ +# -*- coding: iso-8859-1 -*- +""" + MoinMoin - search engine + + @copyright: 2005 MoinMoin:FlorianFesti, + 2005 MoinMoin:NirSoffer, + 2005 MoinMoin:AlexanderSchremmer, + 2006 MoinMoin:ThomasWaldmann, + 2006 MoinMoin:FranzPletz + @license: GNU GPL, see COPYING for details +""" + +import StringIO +from MoinMoin import config, wikiutil +from MoinMoin.Page import Page + 
+############################################################################ +### Results +############################################################################ + +class Match(object): + """ Base class for all Matches (found pieces of pages). + + This class represents a empty True value as returned from negated searches. + """ + # Default match weight + _weight = 1.0 + + def __init__(self, start=0, end=0, re_match=None): + self.re_match = re_match + if not re_match: + self._start = start + self._end = end + else: + self._start = self._end = 0 + + def __len__(self): + return self.end - self.start + + def __eq__(self, other): + equal = (self.__class__ == other.__class__ and + self.start == other.start and + self.end == other.end) + return equal + + def __ne__(self, other): + return not self.__eq__(other) + + def view(self): + return '' + + def weight(self): + return self._weight + + def _get_start(self): + if self.re_match: + return self.re_match.start() + return self._start + + def _get_end(self): + if self.re_match: + return self.re_match.end() + return self._end + + # object properties + start = property(_get_start) + end = property(_get_end) + + +class TextMatch(Match): + """ Represents a match in the page content """ + pass + + +class TitleMatch(Match): + """ Represents a match in the page title + + Has more weight as a match in the page content. + """ + # Matches in titles are much more important in wikis. This setting + # seems to make all pages that have matches in the title to appear + # before pages that their title does not match. + _weight = 100.0 + + +class AttachmentMatch(Match): + """ Represents a match in a attachment content + + Not used yet. 
+ """ + pass + + +class FoundPage: + """ Represents a page in a search result """ + + def __init__(self, page_name, matches=None, page=None): + self.page_name = page_name + self.attachment = '' # this is not an attachment + self.page = page + if matches is None: + matches = [] + self._matches = matches + + def weight(self, unique=1): + """ returns how important this page is for the terms searched for + + Summarize the weight of all page matches + + @param unique: ignore identical matches + @rtype: int + @return: page weight + """ + weight = 0 + for match in self.get_matches(unique=unique): + weight += match.weight() + # More sophisticated things to be added, like increase + # weight of near matches. + return weight + + def add_matches(self, matches): + """ Add found matches """ + self._matches.extend(matches) + + def get_matches(self, unique=1, sort='start', type=Match): + """ Return all matches of type sorted by sort + + @param unique: return only unique matches (bool) + @param sort: match attribute to sort by (string) + @param type: type of match to return (Match or sub class) + @rtype: list + @return: list of matches + """ + if unique: + matches = self._unique_matches(type=type) + if sort == 'start': + # matches already sorted by match.start, finished. + return matches + else: + matches = self._matches + + # Filter by type and sort by sort using fast schwartzian + # transform. + if sort == 'start': + tmp = [(match.start, match) for match in matches + if instance(match, type)] + else: + tmp = [(match.weight(), match) for match in matches + if instance(match, type)] + tmp.sort() + if sort == 'weight': + tmp.reverse() + matches = [item[1] for item in tmp] + + return matches + + def _unique_matches(self, type=Match): + """ Get a list of unique matches of type + + The result is sorted by match.start, because its easy to remove + duplicates like this. 
+ + @param type: type of match to return + @rtype: list + @return: list of matches of type, sorted by match.start + """ + # Filter by type and sort by match.start using fast schwartzian + # transform. + tmp = [(match.start, match) for match in self._matches + if isinstance(match, type)] + tmp.sort() + + if not len(tmp): + return [] + + # Get first match into matches list + matches = [tmp[0][1]] + + # Add the remaining ones of matches ignoring identical matches + for item in tmp[1:]: + if item[1] == matches[-1]: + continue + matches.append(item[1]) + + return matches + + +class FoundAttachment(FoundPage): + """ Represent an attachment in search results """ + + def __init__(self, page_name, attachment, matches=None, page=None): + self.page_name = page_name + self.attachment = attachment + self.page = page + if matches is None: + matches = [] + self._matches = matches + + def weight(self, unique=1): + return 1 + + def get_matches(self, unique=1, sort='start', type=Match): + return [] + + def _unique_matches(self, type=Match): + return [] + + +class FoundRemote(FoundPage): + """ Represent an attachment in search results """ + + def __init__(self, wikiname, page_name, attachment, matches=None, page=None): + self.wikiname = wikiname + self.page_name = page_name + self.attachment = attachment + self.page = page + if matches is None: + matches = [] + self._matches = matches + + def weight(self, unique=1): + return 1 + + def get_matches(self, unique=1, sort='start', type=Match): + return [] + + def _unique_matches(self, type=Match): + return [] + + +############################################################################ +### Search results formatting +############################################################################ + +class SearchResults: + """ Manage search results, supply different views + + Search results can hold valid search results and format them for + many requests, until the wiki content changes. 
+ + For example, one might ask for full page list sorted from A to Z, + and then ask for the same list sorted from Z to A. Or sort results + by name and then by rank. + """ + # Public functions -------------------------------------------------- + + def __init__(self, query, hits, pages, elapsed): + self.query = query # the query + self.hits = hits # hits list + self.sort = None # hits are unsorted initially + self.pages = pages # number of pages in the wiki + self.elapsed = elapsed # search time + + def sortByWeight(self): + """ Sorts found pages by the weight of the matches """ + tmp = [(hit.weight(), hit.page_name, hit) for hit in self.hits] + tmp.sort() + tmp.reverse() + self.hits = [item[2] for item in tmp] + self.sort = 'weight' + + def sortByPagename(self): + """ Sorts a list of found pages alphabetical by page name """ + tmp = [(hit.page_name, hit) for hit in self.hits] + tmp.sort() + self.hits = [item[1] for item in tmp] + self.sort = 'page_name' + + def stats(self, request, formatter): + """ Return search statistics, formatted with formatter + + @param request: current request + @param formatter: formatter to use + @rtype: unicode + @return formatted statistics + """ + _ = request.getText + output = [ + formatter.paragraph(1), + formatter.text(_("%(hits)d results out of about %(pages)d pages.") % + {'hits': len(self.hits), 'pages': self.pages}), + u' (%s)' % formatter.text(_("%.2f seconds") % self.elapsed), + formatter.paragraph(0), + ] + return ''.join(output) + + def pageList(self, request, formatter, info=0, numbered=1): + """ Format a list of found pages + + @param request: current request + @param formatter: formatter to use + @param info: show match info in title + @param numbered: use numbered list for display + @rtype: unicode + @return formatted page list + """ + self._reset(request, formatter) + f = formatter + write = self.buffer.write + if numbered: + list = f.number_list + else: + list = f.bullet_list + + # Add pages formatted as list + if 
self.hits: + write(list(1)) + + for page in self.hits: + if page.attachment: + querydict = { + 'action': 'AttachFile', + 'do': 'get', + 'target': page.attachment, + } + else: + querydict = None + querystr = self.querystring(querydict) + + matchInfo = '' + if info: + matchInfo = self.formatInfo(f, page) + item = [ + f.listitem(1), + f.pagelink(1, page.page_name, querystr=querystr), + self.formatTitle(page), + f.pagelink(0, page.page_name), + matchInfo, + f.listitem(0), + ] + write(''.join(item)) + write(list(0)) + + return self.getvalue() + + def pageListWithContext(self, request, formatter, info=1, context=180, + maxlines=1): + """ Format a list of found pages with context + + The default parameter values will create Google-like search + results, as this is the most known search interface. Good + interface is familiar interface, so unless we have much better + solution (we don't), being like Google is the way. + + @param request: current request + @param formatter: formatter to use + @param info: show match info near the page link + @param context: how many characters to show around each match. + @param maxlines: how many contexts lines to show. 
+ @rtype: unicode + @return formatted page list with context + """ + self._reset(request, formatter) + f = formatter + write = self.buffer.write + + # Add pages formatted as definition list + if self.hits: + write(f.definition_list(1)) + + for page in self.hits: + matchInfo = '' + if info: + matchInfo = self.formatInfo(f, page) + if page.attachment: + fmt_context = "" + querydict = { + 'action': 'AttachFile', + 'do': 'get', + 'target': page.attachment, + } + elif page.page_name.startswith('FS/'): # XXX FS hardcoded + fmt_context = "" + querydict = None + else: + fmt_context = self.formatContext(page, context, maxlines) + querydict = None + querystr = self.querystring(querydict) + item = [ + f.definition_term(1), + f.pagelink(1, page.page_name, querystr=querystr), + self.formatTitle(page), + f.pagelink(0, page.page_name), + matchInfo, + f.definition_term(0), + f.definition_desc(1), + fmt_context, + f.definition_desc(0), + ] + write(''.join(item)) + write(f.definition_list(0)) + + return self.getvalue() + + # Private ----------------------------------------------------------- + + # This methods are not meant to be used by clients and may change + # without notice. + + def formatContext(self, page, context, maxlines): + """ Format search context for each matched page + + Try to show first maxlines interesting matches context. + """ + f = self.formatter + if not page.page: + page.page = Page(self.request, page.page_name) + body = page.page.get_raw_body() + last = len(body) - 1 + lineCount = 0 + output = [] + + # Get unique text matches sorted by match.start, try to ignore + # matches in page header, and show the first maxlines matches. + # TODO: when we implement weight algorithm for text matches, we + # should get the list of text matches sorted by weight and show + # the first maxlines matches. 
+ matches = page.get_matches(unique=1, sort='start', type=TextMatch) + i, start = self.firstInterestingMatch(page, matches) + + # Format context + while i < len(matches) and lineCount < maxlines: + match = matches[i] + + # Get context range for this match + start, end = self.contextRange(context, match, start, last) + + # Format context lines for matches. Each complete match in + # the context will be highlighted, and if the full match is + # in the context, we increase the index, and will not show + # same match again on a separate line. + + output.append(f.text(u'...')) + + # Get the index of the first match completely within the + # context. + for j in xrange(0, len(matches)): + if matches[j].start >= start: + break + + # Add all matches in context and the text between them + while True: + match = matches[j] + # Ignore matches behind the current position + if start < match.end: + # Append the text before match + if start < match.start: + output.append(f.text(body[start:match.start])) + # And the match + output.append(self.formatMatch(body, match, start)) + start = match.end + # Get next match, but only if its completely within the context + if j < len(matches) - 1 and matches[j + 1].end <= end: + j += 1 + else: + break + + # Add text after last match and finish the line + if match.end < end: + output.append(f.text(body[match.end:end])) + output.append(f.text(u'...')) + output.append(f.linebreak(preformatted=0)) + + # Increase line and point to the next match + lineCount += 1 + i = j + 1 + + output = ''.join(output) + + if not output: + # Return the first context characters from the page text + output = f.text(page.page.getPageText(length=context)) + output = output.strip() + if not output: + # This is a page with no text, only header, for example, + # a redirect page. 
+ output = f.text(page.page.getPageHeader(length=context)) + + return output + + def firstInterestingMatch(self, page, matches): + """ Return the first interesting match + + This function is needed only because we don't have yet a weight + algorithm for page text matches. + + Try to find the first match in the page text. If we can't find + one, we return the first match and start=0. + + @rtype: tuple + @return: index of first match, start of text + """ + header = page.page.getPageHeader() + start = len(header) + # Find first match after start + for i in xrange(len(matches)): + if matches[i].start >= start: + return i, start + return 0, 0 + + def contextRange(self, context, match, start, last): + """ Compute context range + + Add context around each match. If there is no room for context + before or after the match, show more context on the other side. + + @param context: context length + @param match: current match + @param start: context should not start before that index, unless + end is past the last character. + @param last: last character index + @rtype: tuple + @return: start, end of context + """ + # Start by giving equal context on both sides of match + contextlen = max(context - len(match), 0) + cstart = match.start - contextlen / 2 + cend = match.end + contextlen / 2 + + # If context start before start, give more context on end + if cstart < start: + cend += start - cstart + cstart = start + + # But if end if after last, give back context to start + if cend > last: + cstart -= cend - last + cend = last + + # Keep context start positive for very short texts + cstart = max(cstart, 0) + + return cstart, cend + + def formatTitle(self, page): + """ Format page title + + Invoke format match on all unique matches in page title. 
+ + @param page: found page + @rtype: unicode + @return: formatted title + """ + # Get unique title matches sorted by match.start + matches = page.get_matches(unique=1, sort='start', type=TitleMatch) + + # Format + pagename = page.page_name + f = self.formatter + output = [] + start = 0 + for match in matches: + # Ignore matches behind the current position + if start < match.end: + # Append the text before the match + if start < match.start: + output.append(f.text(pagename[start:match.start])) + # And the match + output.append(self.formatMatch(pagename, match, start)) + start = match.end + # Add text after match + if start < len(pagename): + output.append(f.text(pagename[start:])) + + if page.attachment: # show the attachment that matched + output.extend([ + " ", + f.strong(1), + f.text("(%s)" % page.attachment), + f.strong(0)]) + + return ''.join(output) + + def formatMatch(self, body, match, location): + """ Format single match in text + + Format the part of the match after the current location in the + text. Matches behind location are ignored and an empty string is + returned. + + @param body: text containing match + @param match: search match in text + @param location: current location in text + @rtype: unicode + @return: formatted match or empty string + """ + start = max(location, match.start) + if start < match.end: + f = self.formatter + output = [ + f.strong(1), + f.text(body[start:match.end]), + f.strong(0), + ] + return ''.join(output) + return '' + + def querystring(self, querydict=None): + """ Return query string, used in the page link """ + if querydict is None: + querydict = {'highlight': self.query.highlight_re()} + querystr = wikiutil.makeQueryString(querydict) + #querystr = wikiutil.escape(querystr) + return querystr + + def formatInfo(self, formatter, page): + """ Return formatted match info """ + template = u' . . . 
%s %s' + template = u"%s%s%s" % (formatter.span(1, css_class="info"), + template, + formatter.span(0)) + # Count number of unique matches in text of all types + count = len(page.get_matches(unique=1)) + info = template % (count, self.matchLabel[count != 1]) + return info + + def getvalue(self): + """ Return output in div with CSS class """ + write = self.request.write + value = [ + self.formatter.div(1, css_class='searchresults'), + self.buffer.getvalue(), + self.formatter.div(0), + ] + return '\n'.join(value) + + def _reset(self, request, formatter): + """ Update internal state before new output + + Do not call this, it should be called only by the instance code. + + Each request might need different translations or other user + preferences. + """ + self.buffer = StringIO.StringIO() + self.formatter = formatter + self.request = request + # Use 1 match, 2 matches... + _ = request.getText + self.matchLabel = (_('match'), _('matches')) + + diff -r d9bd5d6ae30d -r 5469c8b911a4 MoinMoin/Xapian.py --- a/MoinMoin/Xapian.py Tue Jun 27 13:58:39 2006 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,771 +0,0 @@ -# -*- coding: iso-8859-1 -*- -""" - MoinMoin - xapian indexing search engine - - @copyright: 2006 MoinMoin:ThomasWaldmann, - 2006 MoinMoin:FranzPletz - @license: GNU GPL, see COPYING for details. 
-""" -debug = True - -import sys, os, re, codecs, errno, time -from pprint import pprint - -import xapian -from xapian import Query -from MoinMoin.support.xapwrap import document as xapdoc -from MoinMoin.support.xapwrap import index as xapidx -from MoinMoin.parser.text_moin_wiki import Parser as WikiParser - -from MoinMoin.Page import Page -from MoinMoin import config, wikiutil -from MoinMoin.util import filesys, lock - -try: - # PyStemmer, snowball python bindings from http://snowball.tartarus.org/ - from Stemmer import Stemmer -except ImportError: - Stemmer = None - -class UnicodeQuery(xapian.Query): - def __init__(self, *args, **kwargs): - self.encoding = kwargs.get('encoding', config.charset) - - nargs = [] - for term in args: - if isinstance(term, unicode): - term = term.encode(self.encoding) - elif isinstance(term, list) or isinstance(term, tuple): - term = [t.encode(self.encoding) for t in term] - nargs.append(term) - - xapian.Query.__init__(self, *nargs, **kwargs) - - -############################################################################## -### Tokenizer -############################################################################## - -def getWikiAnalyzerFactory(request=None, language='en'): - return (lambda: WikiAnalyzer(request, language)) - -class WikiAnalyzer: - singleword = r"[%(u)s][%(l)s]+" % { - 'u': config.chars_upper, - 'l': config.chars_lower, - } - - singleword_re = re.compile(singleword, re.U) - wikiword_re = re.compile(WikiParser.word_rule, re.U) - - token_re = re.compile( - r"(?P\w+[&@]\w+)|" + # company names like AT&T and Excite@Home. - r"(?P\w+([.-]\w+)*@\w+([.-]\w+)*)|" + # email addresses - r"(?P\w+(\.\w+)+)|" + # hostnames - r"(?P(\w+[-/.,])*\w*\d\w*([-/.,]\w+)*)|" + # version numbers - r"(?P(\w\.)+)|" + # acronyms: U.S.A., I.B.M., etc. 
- r"(?P\w+)", # words (including WikiWords) - re.U) - - dot_re = re.compile(r"[-_/,.]") - mail_re = re.compile(r"[-_/,.]|(@)") - - # XXX limit stuff above to xapdoc.MAX_KEY_LEN - # WORD_RE = re.compile('\\w{1,%i}' % MAX_KEY_LEN, re.U) - - def __init__(self, request=None, language=None): - if request and request.cfg.xapian_stemming and language: - self.stemmer = Stemmer(language) - else: - self.stemmer = None - - def raw_tokenize(self, value): - def enc(uc): - """ 'encode' unicode results into whatever xapian / xapwrap wants """ - lower = uc.lower() - return lower - - if isinstance(value, list): # used for page links - for v in value: - yield enc(v) - else: - tokenstream = re.finditer(self.token_re, value) - for m in tokenstream: - if m.group("acronym"): - yield enc(m.group("acronym").replace('.', '')) - elif m.group("company"): - yield enc(m.group("company")) - elif m.group("email"): - for word in self.mail_re.split(m.group("email")): - if word: - yield enc(word) - elif m.group("hostname"): - for word in self.dot_re.split(m.group("hostname")): - yield enc(word) - elif m.group("num"): - for word in self.dot_re.split(m.group("num")): - yield enc(word) - elif m.group("word"): - word = m.group("word") - yield enc(word) - # if it is a CamelCaseWord, we additionally yield Camel, Case and Word - if self.wikiword_re.match(word): - for sm in re.finditer(self.singleword_re, word): - yield enc(sm.group()) - - def tokenize(self, value, flat_stemming=True): - """Yield a stream of lower cased raw and stemmed (optional) words from a string. - value must be an UNICODE object or a list of unicode objects - """ - for i in self.raw_tokenize(value): - if flat_stemming: - yield i # XXX: should we really use a prefix for that? 
Index.prefixMap['raw'] + i - if self.stemmer: - yield self.stemmer.stemWord(i) - else: - yield (i, self.stemmer.stemWord(i)) - - -############################################################################# -### Indexing -############################################################################# - -class UpdateQueue: - def __init__(self, file, lock_dir): - self.file = file - self.writeLock = lock.WriteLock(lock_dir, timeout=10.0) - self.readLock = lock.ReadLock(lock_dir, timeout=10.0) - - def exists(self): - return os.path.exists(self.file) - - def append(self, pagename): - """ Append a page to queue """ - if not self.writeLock.acquire(60.0): - request.log("can't add %r to xapian update queue: can't lock queue" % - pagename) - return - try: - f = codecs.open(self.file, 'a', config.charset) - try: - f.write(pagename + "\n") - finally: - f.close() - finally: - self.writeLock.release() - - def pages(self): - """ Return list of pages in the queue """ - if self.readLock.acquire(1.0): - try: - return self._decode(self._read()) - finally: - self.readLock.release() - return [] - - def remove(self, pages): - """ Remove pages from the queue - - When the queue is empty, the queue file is removed, so exists() - can tell if there is something waiting in the queue. 
- """ - if self.writeLock.acquire(30.0): - try: - queue = self._decode(self._read()) - for page in pages: - try: - queue.remove(page) - except ValueError: - pass - if queue: - self._write(queue) - else: - self._removeFile() - return True - finally: - self.writeLock.release() - return False - - # Private ------------------------------------------------------- - - def _decode(self, data): - """ Decode queue data """ - pages = data.splitlines() - return self._filterDuplicates(pages) - - def _filterDuplicates(self, pages): - """ Filter duplicates in page list, keeping the order """ - unique = [] - seen = {} - for name in pages: - if not name in seen: - unique.append(name) - seen[name] = 1 - return unique - - def _read(self): - """ Read and return queue data - - This does not do anything with the data so we can release the - lock as soon as possible, enabling others to update the queue. - """ - try: - f = codecs.open(self.file, 'r', config.charset) - try: - return f.read() - finally: - f.close() - except (OSError, IOError), err: - if err.errno != errno.ENOENT: - raise - return '' - - def _write(self, pages): - """ Write pages to queue file - - Requires queue write locking. - """ - # XXX use tmpfile/move for atomic replace on real operating systems - data = '\n'.join(pages) + '\n' - f = codecs.open(self.file, 'w', config.charset) - try: - f.write(data) - finally: - f.close() - - def _removeFile(self): - """ Remove queue file - - Requires queue write locking. - """ - try: - os.remove(self.file) - except OSError, err: - if err.errno != errno.ENOENT: - raise - - -class Index: - indexValueMap = { - # mapping the value names we can easily fetch from the index to - # integers required by xapian. 0 and 1 are reserved by xapwrap! 
- 'pagename': 2, - 'attachment': 3, - 'mtime': 4, - 'wikiname': 5, - } - prefixMap = { - # http://svn.xapian.org/*checkout*/trunk/xapian-applications/omega/docs/termprefixes.txt - 'author': 'A', - 'date': 'D', # numeric format: YYYYMMDD or "latest" - e.g. D20050224 or Dlatest - #G newsGroup (or similar entity - e.g. a web forum name) - 'hostname': 'H', - 'keyword': 'K', - 'lang': 'L', # ISO Language code - #M Month (numeric format: YYYYMM) - #N ISO couNtry code (or domaiN name) - #P Pathname - #Q uniQue id - 'raw': 'R', # Raw (i.e. unstemmed) term - 'title': 'S', # Subject (or title) - 'mimetype': 'T', - 'url': 'U', # full URL of indexed document - if the resulting term would be > 240 - # characters, a hashing scheme is used to prevent overflowing - # the Xapian term length limit (see omindex for how to do this). - #W "weak" (approximately 10 day intervals, taken as YYYYMMD from - # the D term, and changing the last digit to a '2' if it's a '3') - #X longer prefix for user-defined use - 'linkto': 'XLINKTO', # this document links to that document - 'stem_lang': 'XSTEMLANG', # ISO Language code this document was stemmed in - #Y year (four digits) - } - - class LockedException(Exception): - pass - - def __init__(self, request): - self.request = request - cache_dir = request.cfg.cache_dir - main_dir = self._main_dir() - self.dir = os.path.join(main_dir, 'index') - filesys.makeDirs(self.dir) - self.sig_file = os.path.join(main_dir, 'complete') - lock_dir = os.path.join(main_dir, 'index-lock') - self.lock = lock.WriteLock(lock_dir, - timeout=3600.0, readlocktimeout=60.0) - self.read_lock = lock.ReadLock(lock_dir, timeout=3600.0) - self.queue = UpdateQueue(os.path.join(main_dir, 'update-queue'), - os.path.join(main_dir, 'update-queue-lock')) - - # Disabled until we have a sane way to build the index with a - # queue in small steps. 
- ## if not self.exists(): - ## self.indexPagesInNewThread(request) - - # Check if we should and can stem words - if request.cfg.xapian_stemming and not Stemmer: - request.cfg.xapian_stemming = False - - def _main_dir(self): - if self.request.cfg.xapian_index_dir: - return os.path.join(self.request.cfg.xapian_index_dir, - self.request.cfg.siteid) - else: - return os.path.join(self.request.cfg.cache_dir, 'xapian') - - def exists(self): - """ Check if index exists """ - return os.path.exists(self.sig_file) - - def mtime(self): - return os.path.getmtime(self.dir) - - def _search(self, query): - """ read lock must be acquired """ - while True: - try: - searcher, timestamp = self.request.cfg.xapian_searchers.pop() - if timestamp != self.mtime(): - searcher.close() - else: - break - except IndexError: - searcher = xapidx.ReadOnlyIndex(self.dir) - searcher.configure(self.prefixMap, self.indexValueMap) - timestamp = self.mtime() - break - - hits = searcher.search(query, valuesWanted=['pagename', 'attachment', 'mtime', 'wikiname']) - self.request.cfg.xapian_searchers.append((searcher, timestamp)) - return hits - - def search(self, query): - if not self.read_lock.acquire(1.0): - raise self.LockedException - try: - hits = self._search(query) - finally: - self.read_lock.release() - return hits - - def update_page(self, page): - self.queue.append(page.page_name) - self._do_queued_updates_InNewThread() - - def indexPages(self, files=None, mode='update'): - """ Index all pages (and files, if given) - - Can be called only from a script. To index pages during a user - request, use indexPagesInNewThread. 
- @arg files: iterator or list of files to index additionally - """ - if not self.lock.acquire(1.0): - self.request.log("can't index: can't acquire lock") - return - try: - request = self._indexingRequest(self.request) - self._index_pages(request, None, files, mode) - finally: - self.lock.release() - - def indexPagesInNewThread(self, files=None, mode='update'): - """ Index all pages in a new thread - - Should be called from a user request. From a script, use indexPages. - """ - if not self.lock.acquire(1.0): - self.request.log("can't index: can't acquire lock") - return - try: - # Prevent rebuilding the index just after it was finished - if self.exists(): - self.lock.release() - return - from threading import Thread - indexThread = Thread(target=self._index_pages, - args=(self._indexingRequest(self.request), self.lock, files, mode)) - indexThread.setDaemon(True) - - # Join the index thread after current request finish, prevent - # Apache CGI from killing the process. - def joinDecorator(finish): - def func(): - finish() - indexThread.join() - return func - - self.request.finish = joinDecorator(self.request.finish) - indexThread.start() - except: - self.lock.release() - raise - - def optimize(self): - pass - - # Private ---------------------------------------------------------------- - - def _do_queued_updates_InNewThread(self): - """ do queued index updates in a new thread - - Should be called from a user request. From a script, use indexPages. - """ - if not self.lock.acquire(1.0): - self.request.log("can't index: can't acquire lock") - return - try: - from threading import Thread - indexThread = Thread(target=self._do_queued_updates, - args=(self._indexingRequest(self.request), self.lock)) - indexThread.setDaemon(True) - - # Join the index thread after current request finish, prevent - # Apache CGI from killing the process. 
- def joinDecorator(finish): - def func(): - finish() - indexThread.join() - return func - - self.request.finish = joinDecorator(self.request.finish) - indexThread.start() - except: - self.lock.release() - raise - - def _do_queued_updates(self, request, lock=None, amount=5): - """ Assumes that the write lock is acquired """ - try: - writer = xapidx.Index(self.dir, True) - writer.configure(self.prefixMap, self.indexValueMap) - pages = self.queue.pages()[:amount] - for name in pages: - p = Page(request, name) - self._index_page(writer, p, mode='update') - self.queue.remove([name]) - finally: - writer.close() - if lock: - lock.release() - - def contentfilter(self, filename): - """ Get a filter for content of filename and return unicode content. """ - request = self.request - mt = wikiutil.MimeType(filename=filename) - for modulename in mt.module_name(): - try: - execute = wikiutil.importPlugin(request.cfg, 'filter', modulename) - break - except wikiutil.PluginMissingError: - pass - else: - request.log("Cannot load filter for mimetype." 
+ modulename) - try: - data = execute(self, filename) - if debug: - request.log("Filter %s returned %d characters for file %s" % (modulename, len(data), filename)) - except (OSError, IOError), err: - data = '' - request.log("Filter %s threw error '%s' for file %s" % (modulename, str(err), filename)) - return mt.mime_type(), data - - def test(self, request): - idx = xapidx.ReadOnlyIndex(self.dir) - idx.configure(self.prefixMap, self.indexValueMap) - print idx.search("is") - #for d in docs: - # request.log("%r %r %r" % (d, d.get('attachment'), d.get('pagename'))) - - def _index_file(self, request, writer, filename, mode='update'): - """ index a file as it were a page named pagename - Assumes that the write lock is acquired - """ - fs_rootpage = 'FS' # XXX FS hardcoded - try: - wikiname = request.cfg.interwikiname or 'Self' - itemid = "%s:%s" % (wikiname, os.path.join(fs_rootpage, filename)) - mtime = os.path.getmtime(filename) - mtime = wikiutil.timestamp2version(mtime) - if mode == 'update': - query = xapidx.RawQuery(xapdoc.makePairForWrite('itemid', itemid)) - docs = writer.search(query, valuesWanted=['pagename', 'attachment', 'mtime', 'wikiname', ]) - if docs: - doc = docs[0] # there should be only one - uid = doc['uid'] - docmtime = long(doc['values']['mtime']) - updated = mtime > docmtime - if debug: request.log("uid %r: mtime %r > docmtime %r == updated %r" % (uid, mtime, docmtime, updated)) - else: - uid = None - updated = True - elif mode == 'add': - updated = True - if debug: request.log("%s %r" % (filename, updated)) - if updated: - xitemid = xapdoc.Keyword('itemid', itemid) - mimetype, file_content = self.contentfilter(filename) - xwname = xapdoc.SortKey('wikiname', request.cfg.interwikiname or "Self") - xpname = xapdoc.SortKey('pagename', fs_rootpage) - xattachment = xapdoc.SortKey('attachment', filename) # XXX we should treat files like real pages, not attachments - xmtime = xapdoc.SortKey('mtime', mtime) - title = " ".join(os.path.join(fs_rootpage, 
filename).split("/")) - xtitle = xapdoc.Keyword('title', title) - xmimetype = xapdoc.TextField('mimetype', mimetype, True) - xcontent = xapdoc.TextField('content', file_content) - doc = xapdoc.Document(textFields=(xcontent, xmimetype, ), - keywords=(xtitle, xitemid, ), - sortFields=(xpname, xattachment, xmtime, xwname, ), - ) - doc.analyzerFactory = getWikiAnalyzerFactory() - if mode == 'update': - if debug: request.log("%s (replace %r)" % (filename, uid)) - doc.uid = uid - id = writer.index(doc) - elif mode == 'add': - if debug: request.log("%s (add)" % (filename,)) - id = writer.index(doc) - except (OSError, IOError), err: - pass - - def _get_languages(self, page): - body = page.get_raw_body() - default_lang = page.request.cfg.language_default - - lang = '' - - if page.request.cfg.xapian_stemming: - for line in body.split('\n'): - if line.startswith('#language'): - lang = line.split(' ')[1] - try: - Stemmer(lang) - except KeyError: - # lang is not stemmable - break - else: - # lang is stemmable - return (lang, lang) - elif not line.startswith('#'): - break - - if not lang: - # no lang found at all.. 
fallback to default language - lang = default_lang - - # return actual lang and lang to stem in - return (lang, default_lang) - - def _index_page(self, writer, page, mode='update'): - """ Index a page - assumes that the write lock is acquired - @arg writer: the index writer object - @arg page: a page object - @arg mode: 'add' = just add, no checks - 'update' = check if already in index and update if needed (mtime) - - """ - request = page.request - wikiname = request.cfg.interwikiname or "Self" - pagename = page.page_name - mtime = page.mtime_usecs() - itemid = "%s:%s" % (wikiname, pagename) - # XXX: Hack until we get proper metadata - language, stem_language = self._get_languages(page) - updated = False - - if mode == 'update': - # from #xapian: if you generate a special "unique id" term, - # you can just call database.replace_document(uid_term, doc) - # -> done in xapwrap.index.Index.index() - query = xapidx.RawQuery(xapdoc.makePairForWrite('itemid', itemid)) - docs = writer.search(query, valuesWanted=['pagename', 'attachment', 'mtime', 'wikiname', ]) - if docs: - doc = docs[0] # there should be only one - uid = doc['uid'] - docmtime = long(doc['values']['mtime']) - updated = mtime > docmtime - if debug: request.log("uid %r: mtime %r > docmtime %r == updated %r" % (uid, mtime, docmtime, updated)) - else: - uid = None - updated = True - elif mode == 'add': - updated = True - if debug: request.log("%s %r" % (pagename, updated)) - if updated: - xwname = xapdoc.SortKey('wikiname', request.cfg.interwikiname or "Self") - xpname = xapdoc.SortKey('pagename', pagename) - xattachment = xapdoc.SortKey('attachment', '') # this is a real page, not an attachment - xmtime = xapdoc.SortKey('mtime', mtime) - xtitle = xapdoc.TextField('title', pagename, True) # prefixed - xkeywords = [xapdoc.Keyword('itemid', itemid), - xapdoc.Keyword('lang', language), - xapdoc.Keyword('stem_lang', stem_language)] - for pagelink in page.getPageLinks(request): - 
xkeywords.append(xapdoc.Keyword('linkto', pagelink)) - xcontent = xapdoc.TextField('content', page.get_raw_body()) - doc = xapdoc.Document(textFields=(xcontent, xtitle), - keywords=xkeywords, - sortFields=(xpname, xattachment, xmtime, xwname, ), - ) - doc.analyzerFactory = getWikiAnalyzerFactory(request, - stem_language) - - if mode == 'update': - if debug: request.log("%s (replace %r)" % (pagename, uid)) - doc.uid = uid - id = writer.index(doc) - elif mode == 'add': - if debug: request.log("%s (add)" % (pagename,)) - id = writer.index(doc) - - from MoinMoin.action import AttachFile - - attachments = AttachFile._get_files(request, pagename) - for att in attachments: - filename = AttachFile.getFilename(request, pagename, att) - att_itemid = "%s//%s" % (itemid, att) - mtime = wikiutil.timestamp2version(os.path.getmtime(filename)) - if mode == 'update': - query = xapidx.RawQuery(xapdoc.makePairForWrite('itemid', att_itemid)) - docs = writer.search(query, valuesWanted=['pagename', 'attachment', 'mtime', ]) - if debug: request.log("##%r %r" % (filename, docs)) - if docs: - doc = docs[0] # there should be only one - uid = doc['uid'] - docmtime = long(doc['values']['mtime']) - updated = mtime > docmtime - if debug: request.log("uid %r: mtime %r > docmtime %r == updated %r" % (uid, mtime, docmtime, updated)) - else: - uid = None - updated = True - elif mode == 'add': - updated = True - if debug: request.log("%s %s %r" % (pagename, att, updated)) - if updated: - xatt_itemid = xapdoc.Keyword('itemid', att_itemid) - xpname = xapdoc.SortKey('pagename', pagename) - xattachment = xapdoc.SortKey('attachment', att) # this is an attachment, store its filename - xmtime = xapdoc.SortKey('mtime', mtime) - xtitle = xapdoc.Keyword('title', '%s/%s' % (pagename, att)) - xlanguage = xapdoc.Keyword('lang', language) - xstem_language = xapdoc.Keyword('stem_lang', stem_language) - mimetype, att_content = self.contentfilter(filename) - xmimetype = xapdoc.TextField('mimetype', mimetype, True) - 
xcontent = xapdoc.TextField('content', att_content) - doc = xapdoc.Document(textFields=(xcontent, xmimetype, ), - keywords=(xatt_itemid, xtitle, xlanguage, xstem_language, ), - sortFields=(xpname, xattachment, xmtime, xwname, ), - ) - doc.analyzerFactory = getWikiAnalyzerFactory(request, - stem_language) - if mode == 'update': - if debug: request.log("%s (replace %r)" % (pagename, uid)) - doc.uid = uid - id = writer.index(doc) - elif mode == 'add': - if debug: request.log("%s (add)" % (pagename,)) - id = writer.index(doc) - #writer.flush() - - - def _index_pages(self, request, lock=None, files=None, mode='update'): - """ Index all pages (and all given files) - - This should be called from indexPages or indexPagesInNewThread only! - - This may take some time, depending on the size of the wiki and speed - of the machine. - - When called in a new thread, lock is acquired before the call, - and this method must release it when it finishes or fails. - """ - try: - self._unsign() - start = time.time() - writer = xapidx.Index(self.dir, True) - writer.configure(self.prefixMap, self.indexValueMap) - pages = request.rootpage.getPageList(user='', exists=1) - request.log("indexing all (%d) pages..." % len(pages)) - for pagename in pages: - p = Page(request, pagename) - self._index_page(writer, p, mode) - if files: - request.log("indexing all files...") - for fname in files: - fname = fname.strip() - self._index_file(request, writer, fname, mode) - writer.close() - request.log("indexing completed successfully in %0.2f seconds." % - (time.time() - start)) - self._sign() - finally: - writer.__del__() - if lock: - lock.release() - - def _optimize(self, request): - """ Optimize the index """ - pass - - def _indexingRequest(self, request): - """ Return a new request that can be used for index building. - - This request uses a security policy that lets the current user - read any page. Without this policy some pages will not render, - which will create broken pagelinks index. 
- """ - from MoinMoin.request.CLI import Request - from MoinMoin.security import Permissions - request = Request(request.url) - class SecurityPolicy(Permissions): - def read(*args, **kw): - return True - request.user.may = SecurityPolicy(request.user) - return request - - def _unsign(self): - """ Remove sig file - assume write lock acquired """ - try: - os.remove(self.sig_file) - except OSError, err: - if err.errno != errno.ENOENT: - raise - - def _sign(self): - """ Add sig file - assume write lock acquired """ - f = file(self.sig_file, 'w') - try: - f.write('') - finally: - f.close() - - -def run_query(query, db): - enquire = xapian.Enquire(db) - parser = xapian.QueryParser() - query = parser.parse_query(query, xapian.QueryParser.FLAG_WILDCARD) - print query.get_description() - enquire.set_query(query) - return enquire.get_mset(0, 10) - -def run(request): - pass - #print "Begin" - #db = xapian.WritableDatabase(xapian.open('test.db', - # xapian.DB_CREATE_OR_OPEN)) - # - # index_data(db) ??? 
- #del db - #mset = run_query(sys.argv[1], db) - #print mset.get_matches_estimated() - #iterator = mset.begin() - #while iterator != mset.end(): - # print iterator.get_document().get_data() - # iterator.next() - #for i in xrange(1,170): - # doc = db.get_document(i) - # print doc.get_data() - -if __name__ == '__main__': - run() - - diff -r d9bd5d6ae30d -r 5469c8b911a4 MoinMoin/search.py --- a/MoinMoin/search.py Tue Jun 27 13:58:39 2006 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1473 +0,0 @@ -# -*- coding: iso-8859-1 -*- -""" - MoinMoin - search engine - - @copyright: 2005 MoinMoin:FlorianFesti, - 2005 MoinMoin:NirSoffer, - 2005 MoinMoin:AlexanderSchremmer, - 2006 MoinMoin:ThomasWaldmann, - 2006 MoinMoin:FranzPletz - @license: GNU GPL, see COPYING for details -""" - -import re, time, sys, StringIO, string, operator -from sets import Set -from MoinMoin import wikiutil, config -from MoinMoin.Page import Page - -try: - import Xapian - from Xapian import Query, UnicodeQuery -except ImportError: - pass - -############################################################################# -### query objects -############################################################################# - -class BaseExpression: - """ Base class for all search terms """ - - def __init__(self): - self.negated = 0 - - def __str__(self): - return unicode(self).encode(config.charset, 'replace') - - def negate(self): - """ Negate the result of this term """ - self.negated = 1 - - def pageFilter(self): - """ Return a page filtering function - - This function is used to filter page list before we search - it. Return a function that get a page name, and return bool. - - The default expression does not have any filter function and - return None. Sub class may define custom filter functions. - """ - return None - - def search(self, page): - """ Search a page - - Returns a list of Match objects or None if term didn't find - anything (vice versa if negate() was called). 
Terms containing - other terms must call this method to aggregate the results. - This Base class returns True (Match()) if not negated. - """ - if self.negated: - # XXX why? - return [Match()] - else: - return None - - def costs(self): - """ Return estimated time to calculate this term - - Number is relative to other terms and has no real unit. - It allows to do the fast searches first. - """ - return 0 - - def highlight_re(self): - """ Return a regular expression of what the term searches for - - Used to display the needle in the page. - """ - return '' - - def _build_re(self, pattern, use_re=False, case=False, stemmed=False): - """ Make a regular expression out of a text pattern """ - flags = case and re.U or (re.I | re.U) - if use_re: - try: - self.search_re = re.compile(pattern, flags) - except re.error: - pattern = re.escape(pattern) - self.pattern = pattern - self.search_re = re.compile(pattern, flags) - else: - self.pattern = pattern - else: - pattern = re.escape(pattern) - self.search_re = re.compile(pattern, flags) - self.pattern = pattern - - -class AndExpression(BaseExpression): - """ A term connecting several sub terms with a logical AND """ - - operator = ' ' - - def __init__(self, *terms): - self._subterms = list(terms) - self._costs = 0 - for t in self._subterms: - self._costs += t.costs() - self.negated = 0 - - def append(self, expression): - """ Append another term """ - self._subterms.append(expression) - self._costs += expression.costs() - - def subterms(self): - return self._subterms - - def costs(self): - return self._costs - - def __unicode__(self): - result = '' - for t in self._subterms: - result += self.operator + t - return u'[' + result[len(self.operator):] + u']' - - def pageFilter(self): - """ Return a page filtering function - - This function is used to filter page list before we search it. - - Return a function that gets a page name, and return bool, or None. 
- """ - # Sort terms by cost, then get all title searches - self.sortByCost() - terms = [term for term in self._subterms if isinstance(term, TitleSearch)] - if terms: - # Create and return a filter function - def filter(name): - """ A function that return True if all terms filter name """ - for term in terms: - filter = term.pageFilter() - if not filter(name): - return False - return True - return filter - - return None - - def sortByCost(self): - tmp = [(term.costs(), term) for term in self._subterms] - tmp.sort() - self._subterms = [item[1] for item in tmp] - - def search(self, page): - """ Search for each term, cheap searches first """ - self.sortByCost() - matches = [] - for term in self._subterms: - result = term.search(page) - if not result: - return None - matches.extend(result) - return matches - - def highlight_re(self): - result = [] - for s in self._subterms: - highlight_re = s.highlight_re() - if highlight_re: result.append(highlight_re) - - return '|'.join(result) - - def xapian_wanted(self): - wanted = True - for term in self._subterms: - wanted = wanted and term.xapian_wanted() - return wanted - - def xapian_term(self, request): - # sort negated terms - terms = [] - not_terms = [] - for term in self._subterms: - if not term.negated: - terms.append(term.xapian_term(request)) - else: - not_terms.append(term.xapian_term(request)) - - # prepare query for not negated terms - if len(terms) == 1: - t1 = Query(terms[0]) - else: - t1 = Query(Query.OP_AND, terms) - - # negated terms? 
- if not not_terms: - # no, just return query for not negated terms - return t1 - - # yes, link not negated and negated terms' query with a AND_NOT query - if len(not_terms) == 1: - t2 = Query(not_terms[0]) - else: - t2 = Query(Query.OP_OR, not_terms) - - return Query(Query.OP_AND_NOT, t1, t2) - - -class OrExpression(AndExpression): - """ A term connecting several sub terms with a logical OR """ - - operator = ' or ' - - def search(self, page): - """ Search page with terms, cheap terms first - - XXX Do we have any reason to sort here? we are not breaking out - of the search in any case. - """ - self.sortByCost() - matches = [] - for term in self._subterms: - result = term.search(page) - if result: - matches.extend(result) - return matches - - def xapian_term(self, request): - # XXX: negated terms managed by _moinSearch? - return Query(Query.OP_OR, [term.xapian_term(request) for term in self._subterms]) - - -class TextSearch(BaseExpression): - """ A term that does a normal text search - - Both page content and the page title are searched, using an - additional TitleSearch term. 
- """ - - def __init__(self, pattern, use_re=False, case=False): - """ Init a text search - - @param pattern: pattern to search for, ascii string or unicode - @param use_re: treat pattern as re of plain text, bool - @param case: do case sensitive search, bool - """ - self._pattern = unicode(pattern) - self.negated = 0 - self.use_re = use_re - self.case = case - self._build_re(self._pattern, use_re=use_re, case=case) - self.titlesearch = TitleSearch(self._pattern, use_re=use_re, case=case) - - def costs(self): - return 10000 - - def __unicode__(self): - neg = self.negated and '-' or '' - return u'%s"%s"' % (neg, unicode(self._pattern)) - - def highlight_re(self): - return u"(%s)" % self._pattern - - def search(self, page): - matches = [] - - # Search in page name - results = self.titlesearch.search(page) - if results: - matches.extend(results) - - # Search in page body - body = page.get_raw_body() - for match in self.search_re.finditer(body): - if page.request.cfg.xapian_stemming: - # somewhere in regular word - if body[match.start()] not in config.chars_upper and \ - body[match.start()-1] in config.chars_lower: - continue - - post = 0 - for c in body[match.end():]: - if c in config.chars_lower: - post += 1 - else: - break - - matches.append(TextMatch(start=match.start(), - end=match.end()+post)) - else: - matches.append(TextMatch(re_match=match)) - - # Decide what to do with the results. - if ((self.negated and matches) or - (not self.negated and not matches)): - return None - elif matches: - return matches - else: - # XXX why not return None or empty list? 
- return [Match()] - - def xapian_wanted(self): - return not self.use_re - - def xapian_term(self, request): - if self.use_re: - return None # xapian can't do regex search - else: - analyzer = Xapian.WikiAnalyzer(request=request, - language=request.cfg.language_default) - terms = self._pattern.split() - - # all parsed wikiwords, AND'ed - queries = [] - stemmed = [] - for t in terms: - if request.cfg.xapian_stemming: - # stemmed OR not stemmed - tmp = [] - for i in analyzer.tokenize(t, flat_stemming=False): - tmp.append(UnicodeQuery(Query.OP_OR, i)) - stemmed.append(i[1]) - t = tmp - else: - # just not stemmed - t = [UnicodeQuery(i) for i in analyzer.tokenize(t)] - queries.append(Query(Query.OP_AND, t)) - - if stemmed: - self._build_re(' '.join(stemmed), use_re=False, - case=self.case, stemmed=True) - - # titlesearch OR parsed wikiwords - return Query(Query.OP_OR, - (self.titlesearch.xapian_term(request), - Query(Query.OP_AND, queries))) - - -class TitleSearch(BaseExpression): - """ Term searches in pattern in page title only """ - - def __init__(self, pattern, use_re=False, case=False): - """ Init a title search - - @param pattern: pattern to search for, ascii string or unicode - @param use_re: treat pattern as re of plain text, bool - @param case: do case sensitive search, bool - """ - self._pattern = unicode(pattern) - self.negated = 0 - self.use_re = use_re - self.case = case - self._build_re(self._pattern, use_re=use_re, case=case) - - def costs(self): - return 100 - - def __unicode__(self): - neg = self.negated and '-' or '' - return u'%s!"%s"' % (neg, unicode(self._pattern)) - - def highlight_re(self): - return u"(%s)" % self._pattern - - def pageFilter(self): - """ Page filter function for single title search """ - def filter(name): - match = self.search_re.search(name) - if ((self.negated and match) or - (not self.negated and not match)): - return False - return True - return filter - - def search(self, page): - # Get matches in page name - matches = [] - 
for match in self.search_re.finditer(page.page_name): - if page.request.cfg.xapian_stemming: - # somewhere in regular word - if page.page_name[match.start()] not in config.chars_upper and \ - page.page_name[match.start()-1] in config.chars_lower: - continue - - post = 0 - for c in page.page_name[match.end():]: - if c in config.chars_lower: - post += 1 - else: - break - - matches.append(TitleMatch(start=match.start(), - end=match.end()+post)) - else: - matches.append(TitleMatch(re_match=match)) - - if ((self.negated and matches) or - (not self.negated and not matches)): - return None - elif matches: - return matches - else: - # XXX why not return None or empty list? - return [Match()] - - def xapian_wanted(self): - return not self.use_re - - def xapian_term(self, request): - if self.use_re: - return None # xapian doesn't support regex search - else: - analyzer = Xapian.WikiAnalyzer(request=request, - language=request.cfg.language_default) - terms = self._pattern.split() - terms = [list(analyzer.raw_tokenize(t)) for t in terms] - - # all parsed wikiwords, AND'ed - queries = [] - stemmed = [] - for t in terms: - if request.cfg.xapian_stemming: - # stemmed OR not stemmed - tmp = [] - for i in analyzer.tokenize(t, flat_stemming=False): - tmp.append(UnicodeQuery(Query.OP_OR, ['%s%s' % - (Xapian.Index.prefixMap['title'], j) for j in i])) - stemmed.append(i[1]) - t = tmp - else: - # just not stemmed - t = [UnicodeQuery('%s%s' % (Xapian.Index.prefixMap['title'], i)) - for i in analyzer.tokenize(t)] - - queries.append(Query(Query.OP_AND, t)) - - if stemmed: - self._build_re(' '.join(stemmed), use_re=False, - case=self.case, stemmed=True) - - return Query(Query.OP_AND, queries) - - -class LinkSearch(BaseExpression): - """ Search the term in the pagelinks """ - - def __init__(self, pattern, use_re=False, case=True): - """ Init a link search - - @param pattern: pattern to search for, ascii string or unicode - @param use_re: treat pattern as re of plain text, bool - @param case: 
do case sensitive search, bool - """ - # used for search in links - self._pattern = pattern - # used for search in text - self._textpattern = '(' + self._pattern.replace('/', '|') + ')' - self.negated = 0 - self.use_re = use_re - self.case = case - self.textsearch = TextSearch(self._textpattern, use_re=1, case=case) - self._build_re(unicode(pattern), use_re=use_re, case=case) - - def _build_re(self, pattern, use_re=False, case=False): - """ Make a regular expression out of a text pattern """ - flags = case and re.U or (re.I | re.U) - try: - if not use_re: - raise re.error - self.search_re = re.compile(pattern, flags) - self.static = False - except re.error: - self.pattern = pattern - self.static = True - - def costs(self): - return 5000 # cheaper than a TextSearch - - def __unicode__(self): - neg = self.negated and '-' or '' - return u'%s!"%s"' % (neg, unicode(self._pattern)) - - def highlight_re(self): - return u"(%s)" % self._textpattern - - def search(self, page): - # Get matches in page name - matches = [] - - Found = True - - for link in page.getPageLinks(page.request): - if ((self.static and self.pattern == link) or - (not self.static and self.search_re.match(link))): - break - else: - Found = False - - if Found: - # Search in page text - results = self.textsearch.search(page) - if results: - matches.extend(results) - else: #This happens e.g. for pages that use navigation macros - matches.append(TextMatch(0, 0)) - - # Decide what to do with the results. - if ((self.negated and matches) or - (not self.negated and not matches)): - return None - elif matches: - return matches - else: - # XXX why not return None or empty list? 
- return [Match()] - - def xapian_wanted(self): - return not self.use_re - - def xapian_term(self, request): - pattern = self.pattern - if self.use_re: - return None # xapian doesnt support regex search - else: - return UnicodeQuery('%s:%s' % - (Xapian.Index.prefixMap['linkto'], pattern)) - - -class LanguageSearch(BaseExpression): - """ Search the pages written in a language """ - - def __init__(self, pattern, use_re=False, case=True): - """ Init a language search - - @param pattern: pattern to search for, ascii string or unicode - @param use_re: treat pattern as re of plain text, bool - @param case: do case sensitive search, bool - """ - # iso language code, always lowercase - self._pattern = pattern.lower() - self.negated = 0 - self.use_re = use_re - self.case = case - self.xapian_called = False - self._build_re(self._pattern, use_re=use_re, case=case) - - def costs(self): - return 5000 # cheaper than a TextSearch - - def __unicode__(self): - neg = self.negated and '-' or '' - return u'%s!"%s"' % (neg, unicode(self._pattern)) - - def highlight_re(self): - return "" - - def search(self, page): - # We just use (and trust ;)) xapian for this.. deactivated for _moinSearch - if not self.xapian_called: - return None - else: - # XXX why not return None or empty list? - return [Match()] - - def xapian_wanted(self): - return not self.use_re - - def xapian_term(self, request): - pattern = self.pattern - if self.use_re: - return None # xapian doesnt support regex search - else: - self.xapian_called = True - return UnicodeQuery('%s%s' % - (Xapian.Index.prefixMap['lang'], pattern)) - - -############################################################################ -### Results -############################################################################ - -class Match(object): - """ Base class for all Matches (found pieces of pages). - - This class represents a empty True value as returned from negated searches. 
- """ - # Default match weight - _weight = 1.0 - - def __init__(self, start=0, end=0, re_match=None): - self.re_match = re_match - if not re_match: - self._start = start - self._end = end - else: - self._start = self._end = 0 - - def __len__(self): - return self.end - self.start - - def __eq__(self, other): - equal = (self.__class__ == other.__class__ and - self.start == other.start and - self.end == other.end) - return equal - - def __ne__(self, other): - return not self.__eq__(other) - - def view(self): - return '' - - def weight(self): - return self._weight - - def _get_start(self): - if self.re_match: - return self.re_match.start() - return self._start - - def _get_end(self): - if self.re_match: - return self.re_match.end() - return self._end - - # object properties - start = property(_get_start) - end = property(_get_end) - - -class TextMatch(Match): - """ Represents a match in the page content """ - pass - - -class TitleMatch(Match): - """ Represents a match in the page title - - Has more weight as a match in the page content. - """ - # Matches in titles are much more important in wikis. This setting - # seems to make all pages that have matches in the title to appear - # before pages that their title does not match. - _weight = 100.0 - - -class AttachmentMatch(Match): - """ Represents a match in a attachment content - - Not used yet. 
- """ - pass - - -class FoundPage: - """ Represents a page in a search result """ - - def __init__(self, page_name, matches=None, page=None): - self.page_name = page_name - self.attachment = '' # this is not an attachment - self.page = page - if matches is None: - matches = [] - self._matches = matches - - def weight(self, unique=1): - """ returns how important this page is for the terms searched for - - Summarize the weight of all page matches - - @param unique: ignore identical matches - @rtype: int - @return: page weight - """ - weight = 0 - for match in self.get_matches(unique=unique): - weight += match.weight() - # More sophisticated things to be added, like increase - # weight of near matches. - return weight - - def add_matches(self, matches): - """ Add found matches """ - self._matches.extend(matches) - - def get_matches(self, unique=1, sort='start', type=Match): - """ Return all matches of type sorted by sort - - @param unique: return only unique matches (bool) - @param sort: match attribute to sort by (string) - @param type: type of match to return (Match or sub class) - @rtype: list - @return: list of matches - """ - if unique: - matches = self._unique_matches(type=type) - if sort == 'start': - # matches already sorted by match.start, finished. - return matches - else: - matches = self._matches - - # Filter by type and sort by sort using fast schwartzian - # transform. - if sort == 'start': - tmp = [(match.start, match) for match in matches - if instance(match, type)] - else: - tmp = [(match.weight(), match) for match in matches - if instance(match, type)] - tmp.sort() - if sort == 'weight': - tmp.reverse() - matches = [item[1] for item in tmp] - - return matches - - def _unique_matches(self, type=Match): - """ Get a list of unique matches of type - - The result is sorted by match.start, because its easy to remove - duplicates like this. 
- - @param type: type of match to return - @rtype: list - @return: list of matches of type, sorted by match.start - """ - # Filter by type and sort by match.start using fast schwartzian - # transform. - tmp = [(match.start, match) for match in self._matches - if isinstance(match, type)] - tmp.sort() - - if not len(tmp): - return [] - - # Get first match into matches list - matches = [tmp[0][1]] - - # Add the remaining ones of matches ignoring identical matches - for item in tmp[1:]: - if item[1] == matches[-1]: - continue - matches.append(item[1]) - - return matches - - -class FoundAttachment(FoundPage): - """ Represent an attachment in search results """ - - def __init__(self, page_name, attachment, matches=None, page=None): - self.page_name = page_name - self.attachment = attachment - self.page = page - if matches is None: - matches = [] - self._matches = matches - - def weight(self, unique=1): - return 1 - - def get_matches(self, unique=1, sort='start', type=Match): - return [] - - def _unique_matches(self, type=Match): - return [] - - -class FoundRemote(FoundPage): - """ Represent an attachment in search results """ - - def __init__(self, wikiname, page_name, attachment, matches=None, page=None): - self.wikiname = wikiname - self.page_name = page_name - self.attachment = attachment - self.page = page - if matches is None: - matches = [] - self._matches = matches - - def weight(self, unique=1): - return 1 - - def get_matches(self, unique=1, sort='start', type=Match): - return [] - - def _unique_matches(self, type=Match): - return [] - - -############################################################################## -### Parse Query -############################################################################## - - -class QueryParser: - """ - Converts a String into a tree of Query objects - using recursive top/down parsing - """ - - def __init__(self, **kw): - """ - @keyword titlesearch: treat all terms as title searches - @keyword case: do case sensitive search 
- @keyword regex: treat all terms as regular expressions - """ - self.titlesearch = kw.get('titlesearch', 0) - self.case = kw.get('case', 0) - self.regex = kw.get('regex', 0) - - def parse_query(self, query): - """ transform an string into a tree of Query objects """ - if isinstance(query, str): - query = query.decode(config.charset) - self._query = query - result = self._or_expression() - if result is None: - result = BaseExpression() - return result - - def _or_expression(self): - result = self._and_expression() - if self._query: - result = OrExpression(result) - while self._query: - q = self._and_expression() - if q: - result.append(q) - return result - - def _and_expression(self): - result = None - while not result and self._query: - result = self._single_term() - term = self._single_term() - if term: - result = AndExpression(result, term) - else: - return result - term = self._single_term() - while term: - result.append(term) - term = self._single_term() - return result - - def _single_term(self): - regex = (r'(?P-?)\s*(' + # leading '-' - r'(?P\(|\)|(or\b(?!$)))|' + # or, (, ) - r'(?P(\w+:)*)' + - r'(?P("[^"]+")|' + - r"('[^']+')|(\S+)))") # search word itself - self._query = self._query.strip() - match = re.match(regex, self._query, re.U) - if not match: - return None - self._query = self._query[match.end():] - ops = match.group("OPS") - if ops == '(': - result = self._or_expression() - if match.group("NEG"): result.negate() - return result - elif ops == ')': - return None - elif ops == 'or': - return None - modifiers = match.group('MOD').split(":")[:-1] - text = match.group('TERM') - if self.isQuoted(text): - text = text[1:-1] - - title_search = self.titlesearch - regex = self.regex - case = self.case - linkto = False - lang = False - - for m in modifiers: - if "title".startswith(m): - title_search = True - elif "regex".startswith(m): - regex = True - elif "case".startswith(m): - case = True - elif "linkto".startswith(m): - linkto = True - elif 
"language".startswith(m): - lang = True - - if lang: - obj = LanguageSearch(text, use_re=regex, case=False) - elif linkto: - obj = LinkSearch(text, use_re=regex, case=case) - elif title_search: - obj = TitleSearch(text, use_re=regex, case=case) - else: - obj = TextSearch(text, use_re=regex, case=case) - - if match.group("NEG"): - obj.negate() - return obj - - def isQuoted(self, text): - # Empty string '' is not considered quoted - if len(text) < 3: - return False - return (text.startswith('"') and text.endswith('"') or - text.startswith("'") and text.endswith("'")) - - -############################################################################ -### Search results formatting -############################################################################ - -class SearchResults: - """ Manage search results, supply different views - - Search results can hold valid search results and format them for - many requests, until the wiki content changes. - - For example, one might ask for full page list sorted from A to Z, - and then ask for the same list sorted from Z to A. Or sort results - by name and then by rank. 
- """ - # Public functions -------------------------------------------------- - - def __init__(self, query, hits, pages, elapsed): - self.query = query # the query - self.hits = hits # hits list - self.sort = None # hits are unsorted initially - self.pages = pages # number of pages in the wiki - self.elapsed = elapsed # search time - - def sortByWeight(self): - """ Sorts found pages by the weight of the matches """ - tmp = [(hit.weight(), hit.page_name, hit) for hit in self.hits] - tmp.sort() - tmp.reverse() - self.hits = [item[2] for item in tmp] - self.sort = 'weight' - - def sortByPagename(self): - """ Sorts a list of found pages alphabetical by page name """ - tmp = [(hit.page_name, hit) for hit in self.hits] - tmp.sort() - self.hits = [item[1] for item in tmp] - self.sort = 'page_name' - - def stats(self, request, formatter): - """ Return search statistics, formatted with formatter - - @param request: current request - @param formatter: formatter to use - @rtype: unicode - @return formatted statistics - """ - _ = request.getText - output = [ - formatter.paragraph(1), - formatter.text(_("%(hits)d results out of about %(pages)d pages.") % - {'hits': len(self.hits), 'pages': self.pages}), - u' (%s)' % formatter.text(_("%.2f seconds") % self.elapsed), - formatter.paragraph(0), - ] - return ''.join(output) - - def pageList(self, request, formatter, info=0, numbered=1): - """ Format a list of found pages - - @param request: current request - @param formatter: formatter to use - @param info: show match info in title - @param numbered: use numbered list for display - @rtype: unicode - @return formatted page list - """ - self._reset(request, formatter) - f = formatter - write = self.buffer.write - if numbered: - list = f.number_list - else: - list = f.bullet_list - - # Add pages formatted as list - if self.hits: - write(list(1)) - - for page in self.hits: - if page.attachment: - querydict = { - 'action': 'AttachFile', - 'do': 'get', - 'target': page.attachment, - } - 
else: - querydict = None - querystr = self.querystring(querydict) - - matchInfo = '' - if info: - matchInfo = self.formatInfo(f, page) - item = [ - f.listitem(1), - f.pagelink(1, page.page_name, querystr=querystr), - self.formatTitle(page), - f.pagelink(0, page.page_name), - matchInfo, - f.listitem(0), - ] - write(''.join(item)) - write(list(0)) - - return self.getvalue() - - def pageListWithContext(self, request, formatter, info=1, context=180, - maxlines=1): - """ Format a list of found pages with context - - The default parameter values will create Google-like search - results, as this is the most known search interface. Good - interface is familiar interface, so unless we have much better - solution (we don't), being like Google is the way. - - @param request: current request - @param formatter: formatter to use - @param info: show match info near the page link - @param context: how many characters to show around each match. - @param maxlines: how many contexts lines to show. - @rtype: unicode - @return formatted page list with context - """ - self._reset(request, formatter) - f = formatter - write = self.buffer.write - - # Add pages formatted as definition list - if self.hits: - write(f.definition_list(1)) - - for page in self.hits: - matchInfo = '' - if info: - matchInfo = self.formatInfo(f, page) - if page.attachment: - fmt_context = "" - querydict = { - 'action': 'AttachFile', - 'do': 'get', - 'target': page.attachment, - } - elif page.page_name.startswith('FS/'): # XXX FS hardcoded - fmt_context = "" - querydict = None - else: - fmt_context = self.formatContext(page, context, maxlines) - querydict = None - querystr = self.querystring(querydict) - item = [ - f.definition_term(1), - f.pagelink(1, page.page_name, querystr=querystr), - self.formatTitle(page), - f.pagelink(0, page.page_name), - matchInfo, - f.definition_term(0), - f.definition_desc(1), - fmt_context, - f.definition_desc(0), - ] - write(''.join(item)) - write(f.definition_list(0)) - - return 
self.getvalue() - - # Private ----------------------------------------------------------- - - # This methods are not meant to be used by clients and may change - # without notice. - - def formatContext(self, page, context, maxlines): - """ Format search context for each matched page - - Try to show first maxlines interesting matches context. - """ - f = self.formatter - if not page.page: - page.page = Page(self.request, page.page_name) - body = page.page.get_raw_body() - last = len(body) - 1 - lineCount = 0 - output = [] - - # Get unique text matches sorted by match.start, try to ignore - # matches in page header, and show the first maxlines matches. - # TODO: when we implement weight algorithm for text matches, we - # should get the list of text matches sorted by weight and show - # the first maxlines matches. - matches = page.get_matches(unique=1, sort='start', type=TextMatch) - i, start = self.firstInterestingMatch(page, matches) - - # Format context - while i < len(matches) and lineCount < maxlines: - match = matches[i] - - # Get context range for this match - start, end = self.contextRange(context, match, start, last) - - # Format context lines for matches. Each complete match in - # the context will be highlighted, and if the full match is - # in the context, we increase the index, and will not show - # same match again on a separate line. - - output.append(f.text(u'...')) - - # Get the index of the first match completely within the - # context. 
- for j in xrange(0, len(matches)): - if matches[j].start >= start: - break - - # Add all matches in context and the text between them - while True: - match = matches[j] - # Ignore matches behind the current position - if start < match.end: - # Append the text before match - if start < match.start: - output.append(f.text(body[start:match.start])) - # And the match - output.append(self.formatMatch(body, match, start)) - start = match.end - # Get next match, but only if its completely within the context - if j < len(matches) - 1 and matches[j + 1].end <= end: - j += 1 - else: - break - - # Add text after last match and finish the line - if match.end < end: - output.append(f.text(body[match.end:end])) - output.append(f.text(u'...')) - output.append(f.linebreak(preformatted=0)) - - # Increase line and point to the next match - lineCount += 1 - i = j + 1 - - output = ''.join(output) - - if not output: - # Return the first context characters from the page text - output = f.text(page.page.getPageText(length=context)) - output = output.strip() - if not output: - # This is a page with no text, only header, for example, - # a redirect page. - output = f.text(page.page.getPageHeader(length=context)) - - return output - - def firstInterestingMatch(self, page, matches): - """ Return the first interesting match - - This function is needed only because we don't have yet a weight - algorithm for page text matches. - - Try to find the first match in the page text. If we can't find - one, we return the first match and start=0. - - @rtype: tuple - @return: index of first match, start of text - """ - header = page.page.getPageHeader() - start = len(header) - # Find first match after start - for i in xrange(len(matches)): - if matches[i].start >= start: - return i, start - return 0, 0 - - def contextRange(self, context, match, start, last): - """ Compute context range - - Add context around each match. 
If there is no room for context - before or after the match, show more context on the other side. - - @param context: context length - @param match: current match - @param start: context should not start before that index, unless - end is past the last character. - @param last: last character index - @rtype: tuple - @return: start, end of context - """ - # Start by giving equal context on both sides of match - contextlen = max(context - len(match), 0) - cstart = match.start - contextlen / 2 - cend = match.end + contextlen / 2 - - # If context start before start, give more context on end - if cstart < start: - cend += start - cstart - cstart = start - - # But if end if after last, give back context to start - if cend > last: - cstart -= cend - last - cend = last - - # Keep context start positive for very short texts - cstart = max(cstart, 0) - - return cstart, cend - - def formatTitle(self, page): - """ Format page title - - Invoke format match on all unique matches in page title. - - @param page: found page - @rtype: unicode - @return: formatted title - """ - # Get unique title matches sorted by match.start - matches = page.get_matches(unique=1, sort='start', type=TitleMatch) - - # Format - pagename = page.page_name - f = self.formatter - output = [] - start = 0 - for match in matches: - # Ignore matches behind the current position - if start < match.end: - # Append the text before the match - if start < match.start: - output.append(f.text(pagename[start:match.start])) - # And the match - output.append(self.formatMatch(pagename, match, start)) - start = match.end - # Add text after match - if start < len(pagename): - output.append(f.text(pagename[start:])) - - if page.attachment: # show the attachment that matched - output.extend([ - " ", - f.strong(1), - f.text("(%s)" % page.attachment), - f.strong(0)]) - - return ''.join(output) - - def formatMatch(self, body, match, location): - """ Format single match in text - - Format the part of the match after the current 
location in the - text. Matches behind location are ignored and an empty string is - returned. - - @param body: text containing match - @param match: search match in text - @param location: current location in text - @rtype: unicode - @return: formatted match or empty string - """ - start = max(location, match.start) - if start < match.end: - f = self.formatter - output = [ - f.strong(1), - f.text(body[start:match.end]), - f.strong(0), - ] - return ''.join(output) - return '' - - def querystring(self, querydict=None): - """ Return query string, used in the page link """ - if querydict is None: - querydict = {'highlight': self.query.highlight_re()} - querystr = wikiutil.makeQueryString(querydict) - #querystr = wikiutil.escape(querystr) - return querystr - - def formatInfo(self, formatter, page): - """ Return formatted match info """ - template = u' . . . %s %s' - template = u"%s%s%s" % (formatter.span(1, css_class="info"), - template, - formatter.span(0)) - # Count number of unique matches in text of all types - count = len(page.get_matches(unique=1)) - info = template % (count, self.matchLabel[count != 1]) - return info - - def getvalue(self): - """ Return output in div with CSS class """ - write = self.request.write - value = [ - self.formatter.div(1, css_class='searchresults'), - self.buffer.getvalue(), - self.formatter.div(0), - ] - return '\n'.join(value) - - def _reset(self, request, formatter): - """ Update internal state before new output - - Do not call this, it should be called only by the instance code. - - Each request might need different translations or other user - preferences. - """ - self.buffer = StringIO.StringIO() - self.formatter = formatter - self.request = request - # Use 1 match, 2 matches... 
- _ = request.getText - self.matchLabel = (_('match'), _('matches')) - - -############################################################################## -### Searching -############################################################################## - -class Search: - """ A search run """ - - def __init__(self, request, query): - self.request = request - self.query = query - self.filtered = False - self.fs_rootpage = "FS" # XXX FS hardcoded - - def run(self): - """ Perform search and return results object """ - start = time.time() - if self.request.cfg.xapian_search: - hits = self._xapianSearch() - else: - hits = self._moinSearch() - - # important - filter deleted pages or pages the user may not read! - if not self.filtered: - hits = self._filter(hits) - - result_hits = [] - for wikiname, page, attachment, match in hits: - if wikiname in (self.request.cfg.interwikiname, 'Self'): # a local match - if attachment: - result_hits.append(FoundAttachment(page.page_name, attachment)) - else: - result_hits.append(FoundPage(page.page_name, match)) - else: - result_hits.append(FoundRemote(wikiname, page, attachment, match)) - elapsed = time.time() - start - count = self.request.rootpage.getPageCount() - return SearchResults(self.query, result_hits, count, elapsed) - - # ---------------------------------------------------------------- - # Private! - - def _xapianSearch(self): - """ Search using Xapian - - Get a list of pages using fast xapian search and - return moin search in those pages. 
- """ - pages = None - try: - index = Xapian.Index(self.request) - except NameError: - index = None - if index and index.exists() and self.query.xapian_wanted(): - self.request.clock.start('_xapianSearch') - try: - from MoinMoin.support import xapwrap - query = self.query.xapian_term(self.request) - self.request.log("xapianSearch: query = %r" % - query.get_description()) - query = xapwrap.index.QObjQuery(query) - hits = index.search(query) - self.request.log("xapianSearch: finds: %r" % hits) - def dict_decode(d): - """ decode dict values to unicode """ - for k, v in d.items(): - d[k] = d[k].decode(config.charset) - return d - pages = [dict_decode(hit['values']) for hit in hits] - self.request.log("xapianSearch: finds pages: %r" % pages) - except index.LockedException: - pass - self.request.clock.stop('_xapianSearch') - return self._moinSearch(pages) - - def _moinSearch(self, pages=None): - """ Search pages using moin's built-in full text search - - Return list of tuples (page, match). The list may contain - deleted pages or pages the user may not read. 
- """ - self.request.clock.start('_moinSearch') - from MoinMoin.Page import Page - if pages is None: - # if we are not called from _xapianSearch, we make a full pagelist, - # but don't search attachments (thus attachment name = '') - pages = [{'pagename': p, 'attachment': '', 'wikiname': 'Self', } for p in self._getPageList()] - hits = [] - fs_rootpage = self.fs_rootpage - for valuedict in pages: - wikiname = valuedict['wikiname'] - pagename = valuedict['pagename'] - attachment = valuedict['attachment'] - if wikiname in (self.request.cfg.interwikiname, 'Self'): # THIS wiki - page = Page(self.request, pagename) - if attachment: - if pagename == fs_rootpage: # not really an attachment - page = Page(self.request, "%s/%s" % (fs_rootpage, attachment)) - hits.append((wikiname, page, None, None)) - else: - hits.append((wikiname, page, attachment, None)) - else: - match = self.query.search(page) - if match: - hits.append((wikiname, page, attachment, match)) - else: # other wiki - hits.append((wikiname, pagename, attachment, None)) - self.request.clock.stop('_moinSearch') - return hits - - def _getPageList(self): - """ Get list of pages to search in - - If the query has a page filter, use it to filter pages before - searching. If not, get a unfiltered page list. The filtering - will happen later on the hits, which is faster with current - slow storage. - """ - filter = self.query.pageFilter() - if filter: - # There is no need to filter the results again. 
- self.filtered = True - return self.request.rootpage.getPageList(filter=filter) - else: - return self.request.rootpage.getPageList(user='', exists=0) - - def _filter(self, hits): - """ Filter out deleted or acl protected pages """ - userMayRead = self.request.user.may.read - fs_rootpage = self.fs_rootpage + "/" - thiswiki = (self.request.cfg.interwikiname, 'Self') - filtered = [(wikiname, page, attachment, match) for wikiname, page, attachment, match in hits - if not wikiname in thiswiki or - page.exists() and userMayRead(page.page_name) or - page.page_name.startswith(fs_rootpage)] - return filtered - - -def searchPages(request, query, **kw): - """ Search the text of all pages for query. - - @param request: current request - @param query: the expression we want to search for - @rtype: SearchResults instance - @return: search results - """ - return Search(request, query).run() - # HG changeset patch # User Franz Pletz # Date 1151414408 -7200 # Node ID a2498260eca5a77f878a9a5596281560e1dc5bef # Parent 5469c8b911a4f261c579ddf05a96c590b4ae55ef do result processing in results.py diff -r 5469c8b911a4 -r a2498260eca5 MoinMoin/search/builtin.py --- a/MoinMoin/search/builtin.py Tue Jun 27 15:09:46 2006 +0200 +++ b/MoinMoin/search/builtin.py Tue Jun 27 15:20:08 2006 +0200 @@ -13,7 +13,7 @@ import time, sys import time, sys from MoinMoin import wikiutil, config from MoinMoin.Page import Page -from MoinMoin.search.results import FoundRemote, FoundPage, FoundAttachment, SearchResults +from MoinMoin.search.results import getSearchResults try: from MoinMoin.search import Xapian @@ -45,19 +45,9 @@ class Search: # important - filter deleted pages or pages the user may not read! 
if not self.filtered: hits = self._filter(hits) + + return getSearchResults(self.request, self.query, hits, start) - result_hits = [] - for wikiname, page, attachment, match in hits: - if wikiname in (self.request.cfg.interwikiname, 'Self'): # a local match - if attachment: - result_hits.append(FoundAttachment(page.page_name, attachment)) - else: - result_hits.append(FoundPage(page.page_name, match)) - else: - result_hits.append(FoundRemote(wikiname, page, attachment, match)) - elapsed = time.time() - start - count = self.request.rootpage.getPageCount() - return SearchResults(self.query, result_hits, count, elapsed) # ---------------------------------------------------------------- # Private! diff -r 5469c8b911a4 -r a2498260eca5 MoinMoin/search/results.py --- a/MoinMoin/search/results.py Tue Jun 27 15:09:46 2006 +0200 +++ b/MoinMoin/search/results.py Tue Jun 27 15:20:08 2006 +0200 @@ -10,7 +10,7 @@ @license: GNU GPL, see COPYING for details """ -import StringIO +import StringIO, time from MoinMoin import config, wikiutil from MoinMoin.Page import Page @@ -640,3 +640,17 @@ class SearchResults: self.matchLabel = (_('match'), _('matches')) +def getSearchResults(request, query, hits, start): + result_hits = [] + for wikiname, page, attachment, match in hits: + if wikiname in (request.cfg.interwikiname, 'Self'): # a local match + if attachment: + result_hits.append(FoundAttachment(page.page_name, attachment)) + else: + result_hits.append(FoundPage(page.page_name, match)) + else: + result_hits.append(FoundRemote(wikiname, page, attachment, match)) + elapsed = time.time() - start + count = request.rootpage.getPageCount() + return SearchResults(query, result_hits, count, elapsed) + # HG changeset patch # User Franz Pletz # Date 1151530906 -7200 # Node ID 45e2861838727ea36e73d304bedf374678a6a757 # Parent a2498260eca5a77f878a9a5596281560e1dc5bef abstraction work on search engine index & cleanups diff -r a2498260eca5 -r 45e286183872 MoinMoin/action/fullsearch.py --- 
a/MoinMoin/action/fullsearch.py Tue Jun 27 15:20:08 2006 +0200 +++ b/MoinMoin/action/fullsearch.py Wed Jun 28 23:41:46 2006 +0200 @@ -56,9 +56,10 @@ def execute(pagename, request, fieldname return # search the pages - from MoinMoin import search - results = search.searchPages(request, needle, case=case, - regex=regex, titlesearch=titlesearch) + from MoinMoin.search import searchPages, QueryParser + query = QueryParser(case=case, regex=regex, + titlesearch=titlesearch).parse_query(needle) + results = searchPages(request, query) # directly show a single hit # XXX won't work with attachment search diff -r a2498260eca5 -r 45e286183872 MoinMoin/script/index/build.py --- a/MoinMoin/script/index/build.py Tue Jun 27 15:20:08 2006 +0200 +++ b/MoinMoin/script/index/build.py Wed Jun 28 23:41:46 2006 +0200 @@ -38,7 +38,7 @@ class PluginScript(IndexScript): """ Xapian index build script class """ def command(self): - from MoinMoin.Xapian import Index + from MoinMoin.search.Xapian import Index Index(self.request).indexPages(self.files, self.options.mode) #Index(self.request).test(self.request) diff -r a2498260eca5 -r 45e286183872 MoinMoin/search/Xapian.py --- a/MoinMoin/search/Xapian.py Tue Jun 27 15:20:08 2006 +0200 +++ b/MoinMoin/search/Xapian.py Wed Jun 28 23:41:46 2006 +0200 @@ -8,7 +8,7 @@ """ debug = True -import sys, os, re, codecs, errno, time +import sys, os, re, codecs, time from pprint import pprint import xapian @@ -19,7 +19,7 @@ from MoinMoin.parser.text_moin_wiki impo from MoinMoin.Page import Page from MoinMoin import config, wikiutil -from MoinMoin.util import filesys, lock +from MoinMoin.search.builtin import BaseIndex try: # PyStemmer, snowball python bindings from http://snowball.tartarus.org/ @@ -130,122 +130,7 @@ class WikiAnalyzer: ### Indexing ############################################################################# -class UpdateQueue: - def __init__(self, file, lock_dir): - self.file = file - self.writeLock = lock.WriteLock(lock_dir, timeout=10.0) - 
self.readLock = lock.ReadLock(lock_dir, timeout=10.0) - - def exists(self): - return os.path.exists(self.file) - - def append(self, pagename): - """ Append a page to queue """ - if not self.writeLock.acquire(60.0): - request.log("can't add %r to xapian update queue: can't lock queue" % - pagename) - return - try: - f = codecs.open(self.file, 'a', config.charset) - try: - f.write(pagename + "\n") - finally: - f.close() - finally: - self.writeLock.release() - - def pages(self): - """ Return list of pages in the queue """ - if self.readLock.acquire(1.0): - try: - return self._decode(self._read()) - finally: - self.readLock.release() - return [] - - def remove(self, pages): - """ Remove pages from the queue - - When the queue is empty, the queue file is removed, so exists() - can tell if there is something waiting in the queue. - """ - if self.writeLock.acquire(30.0): - try: - queue = self._decode(self._read()) - for page in pages: - try: - queue.remove(page) - except ValueError: - pass - if queue: - self._write(queue) - else: - self._removeFile() - return True - finally: - self.writeLock.release() - return False - - # Private ------------------------------------------------------- - - def _decode(self, data): - """ Decode queue data """ - pages = data.splitlines() - return self._filterDuplicates(pages) - - def _filterDuplicates(self, pages): - """ Filter duplicates in page list, keeping the order """ - unique = [] - seen = {} - for name in pages: - if not name in seen: - unique.append(name) - seen[name] = 1 - return unique - - def _read(self): - """ Read and return queue data - - This does not do anything with the data so we can release the - lock as soon as possible, enabling others to update the queue. 
- """ - try: - f = codecs.open(self.file, 'r', config.charset) - try: - return f.read() - finally: - f.close() - except (OSError, IOError), err: - if err.errno != errno.ENOENT: - raise - return '' - - def _write(self, pages): - """ Write pages to queue file - - Requires queue write locking. - """ - # XXX use tmpfile/move for atomic replace on real operating systems - data = '\n'.join(pages) + '\n' - f = codecs.open(self.file, 'w', config.charset) - try: - f.write(data) - finally: - f.close() - - def _removeFile(self): - """ Remove queue file - - Requires queue write locking. - """ - try: - os.remove(self.file) - except OSError, err: - if err.errno != errno.ENOENT: - raise - - -class Index: +class Index(BaseIndex): indexValueMap = { # mapping the value names we can easily fetch from the index to # integers required by xapian. 0 and 1 are reserved by xapwrap! @@ -280,27 +165,8 @@ class Index: #Y year (four digits) } - class LockedException(Exception): - pass - def __init__(self, request): - self.request = request - cache_dir = request.cfg.cache_dir - main_dir = self._main_dir() - self.dir = os.path.join(main_dir, 'index') - filesys.makeDirs(self.dir) - self.sig_file = os.path.join(main_dir, 'complete') - lock_dir = os.path.join(main_dir, 'index-lock') - self.lock = lock.WriteLock(lock_dir, - timeout=3600.0, readlocktimeout=60.0) - self.read_lock = lock.ReadLock(lock_dir, timeout=3600.0) - self.queue = UpdateQueue(os.path.join(main_dir, 'update-queue'), - os.path.join(main_dir, 'update-queue-lock')) - - # Disabled until we have a sane way to build the index with a - # queue in small steps. 
- ## if not self.exists(): - ## self.indexPagesInNewThread(request) + BaseIndex.__init__(self, request) # Check if we should and can stem words if request.cfg.xapian_stemming and not Stemmer: @@ -312,13 +178,6 @@ class Index: self.request.cfg.siteid) else: return os.path.join(self.request.cfg.cache_dir, 'xapian') - - def exists(self): - """ Check if index exists """ - return os.path.exists(self.sig_file) - - def mtime(self): - return os.path.getmtime(self.dir) def _search(self, query): """ read lock must be acquired """ @@ -339,142 +198,24 @@ class Index: self.request.cfg.xapian_searchers.append((searcher, timestamp)) return hits - def search(self, query): - if not self.read_lock.acquire(1.0): - raise self.LockedException - try: - hits = self._search(query) - finally: - self.read_lock.release() - return hits - - def update_page(self, page): - self.queue.append(page.page_name) - self._do_queued_updates_InNewThread() - - def indexPages(self, files=None, mode='update'): - """ Index all pages (and files, if given) - - Can be called only from a script. To index pages during a user - request, use indexPagesInNewThread. - @arg files: iterator or list of files to index additionally - """ - if not self.lock.acquire(1.0): - self.request.log("can't index: can't acquire lock") - return - try: - request = self._indexingRequest(self.request) - self._index_pages(request, None, files, mode) - finally: - self.lock.release() - - def indexPagesInNewThread(self, files=None, mode='update'): - """ Index all pages in a new thread - - Should be called from a user request. From a script, use indexPages. 
- """ - if not self.lock.acquire(1.0): - self.request.log("can't index: can't acquire lock") - return - try: - # Prevent rebuilding the index just after it was finished - if self.exists(): - self.lock.release() - return - from threading import Thread - indexThread = Thread(target=self._index_pages, - args=(self._indexingRequest(self.request), self.lock, files, mode)) - indexThread.setDaemon(True) - - # Join the index thread after current request finish, prevent - # Apache CGI from killing the process. - def joinDecorator(finish): - def func(): - finish() - indexThread.join() - return func - - self.request.finish = joinDecorator(self.request.finish) - indexThread.start() - except: - self.lock.release() - raise - - def optimize(self): - pass - - # Private ---------------------------------------------------------------- - - def _do_queued_updates_InNewThread(self): - """ do queued index updates in a new thread - - Should be called from a user request. From a script, use indexPages. - """ - if not self.lock.acquire(1.0): - self.request.log("can't index: can't acquire lock") - return - try: - from threading import Thread - indexThread = Thread(target=self._do_queued_updates, - args=(self._indexingRequest(self.request), self.lock)) - indexThread.setDaemon(True) - - # Join the index thread after current request finish, prevent - # Apache CGI from killing the process. 
- def joinDecorator(finish): - def func(): - finish() - indexThread.join() - return func - - self.request.finish = joinDecorator(self.request.finish) - indexThread.start() - except: - self.lock.release() - raise - - def _do_queued_updates(self, request, lock=None, amount=5): + def _do_queued_updates(self, request, amount=5): """ Assumes that the write lock is acquired """ - try: - writer = xapidx.Index(self.dir, True) - writer.configure(self.prefixMap, self.indexValueMap) - pages = self.queue.pages()[:amount] - for name in pages: - p = Page(request, name) - self._index_page(writer, p, mode='update') - self.queue.remove([name]) - finally: - writer.close() - if lock: - lock.release() - - def contentfilter(self, filename): - """ Get a filter for content of filename and return unicode content. """ - request = self.request - mt = wikiutil.MimeType(filename=filename) - for modulename in mt.module_name(): - try: - execute = wikiutil.importPlugin(request.cfg, 'filter', modulename) - break - except wikiutil.PluginMissingError: - pass - else: - request.log("Cannot load filter for mimetype." + modulename) - try: - data = execute(self, filename) - if debug: - request.log("Filter %s returned %d characters for file %s" % (modulename, len(data), filename)) - except (OSError, IOError), err: - data = '' - request.log("Filter %s threw error '%s' for file %s" % (modulename, str(err), filename)) - return mt.mime_type(), data - - def test(self, request): - idx = xapidx.ReadOnlyIndex(self.dir) - idx.configure(self.prefixMap, self.indexValueMap) - print idx.search("is") - #for d in docs: - # request.log("%r %r %r" % (d, d.get('attachment'), d.get('pagename'))) + writer = xapidx.Index(self.dir, True) + writer.configure(self.prefixMap, self.indexValueMap) + pages = self.queue.pages()[:amount] + for name in pages: + p = Page(request, name) + self._index_page(writer, p, mode='update') + self.queue.remove([name]) + writer.close() + + # XXX: why? 
+ #def test(self, request): + # idx = xapidx.ReadOnlyIndex(self.dir) + # idx.configure(self.prefixMap, self.indexValueMap) + # print idx.search("is") + # #for d in docs: + # # request.log("%r %r %r" % (d, d.get('attachment'), d.get('pagename'))) def _index_file(self, request, writer, filename, mode='update'): """ index a file as it were a page named pagename @@ -665,9 +406,8 @@ class Index: if debug: request.log("%s (add)" % (pagename,)) id = writer.index(doc) #writer.flush() - - - def _index_pages(self, request, lock=None, files=None, mode='update'): + + def _index_pages(self, request, files=None, mode='update'): """ Index all pages (and all given files) This should be called from indexPages or indexPagesInNewThread only! @@ -679,8 +419,6 @@ class Index: and this method must release it when it finishes or fails. """ try: - self._unsign() - start = time.time() writer = xapidx.Index(self.dir, True) writer.configure(self.prefixMap, self.indexValueMap) pages = request.rootpage.getPageList(user='', exists=1) @@ -694,50 +432,8 @@ class Index: fname = fname.strip() self._index_file(request, writer, fname, mode) writer.close() - request.log("indexing completed successfully in %0.2f seconds." % - (time.time() - start)) - self._sign() finally: writer.__del__() - if lock: - lock.release() - - def _optimize(self, request): - """ Optimize the index """ - pass - - def _indexingRequest(self, request): - """ Return a new request that can be used for index building. - - This request uses a security policy that lets the current user - read any page. Without this policy some pages will not render, - which will create broken pagelinks index. 
- """ - from MoinMoin.request.CLI import Request - from MoinMoin.security import Permissions - request = Request(request.url) - class SecurityPolicy(Permissions): - def read(*args, **kw): - return True - request.user.may = SecurityPolicy(request.user) - return request - - def _unsign(self): - """ Remove sig file - assume write lock acquired """ - try: - os.remove(self.sig_file) - except OSError, err: - if err.errno != errno.ENOENT: - raise - - def _sign(self): - """ Add sig file - assume write lock acquired """ - f = file(self.sig_file, 'w') - try: - f.write('') - finally: - f.close() - def run_query(query, db): enquire = xapian.Enquire(db) diff -r a2498260eca5 -r 45e286183872 MoinMoin/search/builtin.py --- a/MoinMoin/search/builtin.py Tue Jun 27 15:20:08 2006 +0200 +++ b/MoinMoin/search/builtin.py Wed Jun 28 23:41:46 2006 +0200 @@ -10,16 +10,336 @@ @license: GNU GPL, see COPYING for details """ -import time, sys +import time, sys, os, errno from MoinMoin import wikiutil, config from MoinMoin.Page import Page +from MoinMoin.util import filesys, lock from MoinMoin.search.results import getSearchResults -try: - from MoinMoin.search import Xapian -except ImportError: - pass - +############################################################################## +# Search Engine Abstraction +############################################################################## + +class UpdateQueue: + def __init__(self, file, lock_dir): + self.file = file + self.writeLock = lock.WriteLock(lock_dir, timeout=10.0) + self.readLock = lock.ReadLock(lock_dir, timeout=10.0) + + def exists(self): + return os.path.exists(self.file) + + def append(self, pagename): + """ Append a page to queue """ + if not self.writeLock.acquire(60.0): + request.log("can't add %r to xapian update queue: can't lock queue" % + pagename) + return + try: + f = codecs.open(self.file, 'a', config.charset) + try: + f.write(pagename + "\n") + finally: + f.close() + finally: + self.writeLock.release() + + def 
pages(self): + """ Return list of pages in the queue """ + if self.readLock.acquire(1.0): + try: + return self._decode(self._read()) + finally: + self.readLock.release() + return [] + + def remove(self, pages): + """ Remove pages from the queue + + When the queue is empty, the queue file is removed, so exists() + can tell if there is something waiting in the queue. + """ + if self.writeLock.acquire(30.0): + try: + queue = self._decode(self._read()) + for page in pages: + try: + queue.remove(page) + except ValueError: + pass + if queue: + self._write(queue) + else: + self._removeFile() + return True + finally: + self.writeLock.release() + return False + + # Private ------------------------------------------------------- + + def _decode(self, data): + """ Decode queue data """ + pages = data.splitlines() + return self._filterDuplicates(pages) + + def _filterDuplicates(self, pages): + """ Filter duplicates in page list, keeping the order """ + unique = [] + seen = {} + for name in pages: + if not name in seen: + unique.append(name) + seen[name] = 1 + return unique + + def _read(self): + """ Read and return queue data + + This does not do anything with the data so we can release the + lock as soon as possible, enabling others to update the queue. + """ + try: + f = codecs.open(self.file, 'r', config.charset) + try: + return f.read() + finally: + f.close() + except (OSError, IOError), err: + if err.errno != errno.ENOENT: + raise + return '' + + def _write(self, pages): + """ Write pages to queue file + + Requires queue write locking. + """ + # XXX use tmpfile/move for atomic replace on real operating systems + data = '\n'.join(pages) + '\n' + f = codecs.open(self.file, 'w', config.charset) + try: + f.write(data) + finally: + f.close() + + def _removeFile(self): + """ Remove queue file + + Requires queue write locking. 
+ """ + try: + os.remove(self.file) + except OSError, err: + if err.errno != errno.ENOENT: + raise + +class BaseIndex: + class LockedException(Exception): + pass + + def __init__(self, request): + self.request = request + cache_dir = request.cfg.cache_dir + main_dir = self._main_dir() + self.dir = os.path.join(main_dir, 'index') + filesys.makeDirs(self.dir) + self.sig_file = os.path.join(main_dir, 'complete') + lock_dir = os.path.join(main_dir, 'index-lock') + self.lock = lock.WriteLock(lock_dir, + timeout=3600.0, readlocktimeout=60.0) + self.read_lock = lock.ReadLock(lock_dir, timeout=3600.0) + self.queue = UpdateQueue(os.path.join(main_dir, 'update-queue'), + os.path.join(main_dir, 'update-queue-lock')) + + # Disabled until we have a sane way to build the index with a + # queue in small steps. + ## if not self.exists(): + ## self.indexPagesInNewThread(request) + + def _main_dir(self): + raise NotImplemented + + def exists(self): + """ Check if index exists """ + return os.path.exists(self.sig_file) + + def mtime(self): + return os.path.getmtime(self.dir) + + def _search(self, query): + raise NotImplemented + + def search(self, query): + if not self.read_lock.acquire(1.0): + raise self.LockedException + try: + hits = self._search(query) + finally: + self.read_lock.release() + return hits + + def update_page(self, page): + self.queue.append(page.page_name) + self._do_queued_updates_InNewThread() + + def indexPages(self, files=None, mode='update'): + """ Index all pages (and files, if given) + + Can be called only from a script. To index pages during a user + request, use indexPagesInNewThread. + @arg files: iterator or list of files to index additionally + """ + if not self.lock.acquire(1.0): + self.request.log("can't index: can't acquire lock") + return + try: + self._unsign() + start = time.time() + request = self._indexingRequest(self.request) + self._index_pages(request, files, mode) + request.log("indexing completed successfully in %0.2f seconds." 
% + (time.time() - start)) + self._sign() + finally: + self.lock.release() + + def indexPagesInNewThread(self, files=None, mode='update'): + """ Index all pages in a new thread + + Should be called from a user request. From a script, use indexPages. + """ + # Prevent rebuilding the index just after it was finished + if self.exists(): + return + + from threading import Thread + indexThread = Thread(target=self._index_pages, args=(files, mode)) + indexThread.setDaemon(True) + + # Join the index thread after current request finish, prevent + # Apache CGI from killing the process. + def joinDecorator(finish): + def func(): + finish() + indexThread.join() + return func + + self.request.finish = joinDecorator(self.request.finish) + indexThread.start() + + def _index_pages(self, request, files=None, mode='update'): + """ Index all pages (and all given files) + + This should be called from indexPages or indexPagesInNewThread only! + + This may take some time, depending on the size of the wiki and speed + of the machine. + + When called in a new thread, lock is acquired before the call, + and this method must release it when it finishes or fails. + """ + raise NotImplemented + + def _do_queued_updates_InNewThread(self): + """ do queued index updates in a new thread + + Should be called from a user request. From a script, use indexPages. + """ + if not self.lock.acquire(1.0): + self.request.log("can't index: can't acquire lock") + return + try: + def lockedDecorator(self, f): + def func(*args, **kwargs): + try: + return f(*args, **kwargs) + finally: + self.lock.release() + return func + + from threading import Thread + indexThread = Thread( + target=lockedDecorator(self._do_queued_updates), + args=(self._indexingRequest(self.request),)) + indexThread.setDaemon(True) + + # Join the index thread after current request finish, prevent + # Apache CGI from killing the process. 
+ def joinDecorator(finish): + def func(): + finish() + indexThread.join() + return func + + self.request.finish = joinDecorator(self.request.finish) + indexThread.start() + except: + self.lock.release() + raise + + def _do_queued_updates(self, request, amount=5): + raise NotImplemented + + def optimize(self): + raise NotImplemented + + def contentfilter(self, filename): + """ Get a filter for content of filename and return unicode content. """ + request = self.request + mt = wikiutil.MimeType(filename=filename) + for modulename in mt.module_name(): + try: + execute = wikiutil.importPlugin(request.cfg, 'filter', modulename) + break + except wikiutil.PluginMissingError: + pass + else: + request.log("Cannot load filter for mimetype." + modulename) + try: + data = execute(self, filename) + # XXX: proper debugging? + #if debug: + # request.log("Filter %s returned %d characters for file %s" % (modulename, len(data), filename)) + except (OSError, IOError), err: + data = '' + request.log("Filter %s threw error '%s' for file %s" % (modulename, str(err), filename)) + return mt.mime_type(), data + + def test(self, request): + raise NotImplemented + + def _indexingRequest(self, request): + """ Return a new request that can be used for index building. + + This request uses a security policy that lets the current user + read any page. Without this policy some pages will not render, + which will create broken pagelinks index. 
+ """ + from MoinMoin.request.CLI import Request + from MoinMoin.security import Permissions + request = Request(request.url) + class SecurityPolicy(Permissions): + def read(*args, **kw): + return True + request.user.may = SecurityPolicy(request.user) + return request + + def _unsign(self): + """ Remove sig file - assume write lock acquired """ + try: + os.remove(self.sig_file) + except OSError, err: + if err.errno != errno.ENOENT: + raise + + def _sign(self): + """ Add sig file - assume write lock acquired """ + f = file(self.sig_file, 'w') + try: + f.write('') + finally: + f.close() ############################################################################## ### Searching @@ -60,8 +380,9 @@ class Search: """ pages = None try: - index = Xapian.Index(self.request) - except NameError: + from MoinMoin.search.Xapian import Index + index = Index(self.request) + except ImportError: index = None if index and index.exists() and self.query.xapian_wanted(): self.request.clock.start('_xapianSearch') @@ -80,7 +401,7 @@ class Search: return d pages = [dict_decode(hit['values']) for hit in hits] self.request.log("xapianSearch: finds pages: %r" % pages) - except index.LockedException: + except BaseIndex.LockedException: pass self.request.clock.stop('_xapianSearch') return self._moinSearch(pages) diff -r a2498260eca5 -r 45e286183872 docs/CHANGES.fpletz --- a/docs/CHANGES.fpletz Tue Jun 27 15:20:08 2006 +0200 +++ b/docs/CHANGES.fpletz Wed Jun 28 23:41:46 2006 +0200 @@ -91,5 +91,12 @@ 2006-06-21 2006-06-27 * Splitting out MoinMoin/search.py to MoinMoin/search/*.py, no more - need to invoke QueryParser manually when using searchPages + need to invoke QueryParser manually when using searchPages, minor + refactoring +2006-06-28 + * Abstraction of a locked search engine index: + MoinMoin.search.builtin.BaseIndex, MoinMoin.search.Xapian.Index is + derived from this, cleanups in calling structure and function + prototypes to make it more extensible + # HG changeset patch # User Franz 
Pletz # Date 1151531578 -7200 # Node ID 63e0bd0eea9881426eb2692f6bfc3c1f62e88266 # Parent 45e2861838727ea36e73d304bedf374678a6a757 Bugfix for PageEditor.py diff -r 45e286183872 -r 63e0bd0eea98 MoinMoin/PageEditor.py --- a/MoinMoin/PageEditor.py Wed Jun 28 23:41:46 2006 +0200 +++ b/MoinMoin/PageEditor.py Wed Jun 28 23:52:58 2006 +0200 @@ -968,8 +968,8 @@ Please review the page and save then. Do msg = msg + self._notifySubscribers(comment, trivial) if self.request.cfg.xapian_search: - from MoinMoin import Xapian - index = Xapian.Index(self.request) + from MoinMoin.search.Xapian import Index + index = Index(self.request) # When we have automatic index building, we can add to # the queue even if the index is missing. if index.exists(): # HG changeset patch # User Franz Pletz # Date 1151786834 -7200 # Node ID 22f6f589162a592c351a09a88e6184e111871017 # Parent f16cf67d34401247ce7b0d101800a45a9bcb0f7a term-based regexp search diff -r f16cf67d3440 -r 22f6f589162a MoinMoin/search/Xapian.py --- a/MoinMoin/search/Xapian.py Sat Jul 01 20:18:39 2006 +0200 +++ b/MoinMoin/search/Xapian.py Sat Jul 01 22:47:14 2006 +0200 @@ -209,13 +209,12 @@ class Index(BaseIndex): self.queue.remove([name]) writer.close() - # XXX: why? 
- #def test(self, request): - # idx = xapidx.ReadOnlyIndex(self.dir) - # idx.configure(self.prefixMap, self.indexValueMap) - # print idx.search("is") - # #for d in docs: - # # request.log("%r %r %r" % (d, d.get('attachment'), d.get('pagename'))) + def allterms(self): + db = xapidx.ExceptionTranslater.openIndex(True, self.dir) + i = db.allterms_begin() + while i != db.allterms_end(): + yield i.get_term() + i.next() def _index_file(self, request, writer, filename, mode='update'): """ index a file as it were a page named pagename diff -r f16cf67d3440 -r 22f6f589162a MoinMoin/search/builtin.py --- a/MoinMoin/search/builtin.py Sat Jul 01 20:18:39 2006 +0200 +++ b/MoinMoin/search/builtin.py Sat Jul 01 22:47:14 2006 +0200 @@ -384,11 +384,12 @@ class Search: index = Index(self.request) except ImportError: index = None - if index and index.exists() and self.query.xapian_wanted(): + if index and index.exists(): #and self.query.xapian_wanted(): self.request.clock.start('_xapianSearch') try: from MoinMoin.support import xapwrap - query = self.query.xapian_term(self.request) + query = self.query.xapian_term(self.request, + index.allterms) self.request.log("xapianSearch: query = %r" % query.get_description()) query = xapwrap.index.QObjQuery(query) @@ -403,6 +404,8 @@ class Search: self.request.log("xapianSearch: finds pages: %r" % pages) except BaseIndex.LockedException: pass + #except AttributeError: + # pages = [] self.request.clock.stop('_xapianSearch') return self._moinSearch(pages) diff -r f16cf67d3440 -r 22f6f589162a MoinMoin/search/queryparser.py --- a/MoinMoin/search/queryparser.py Sat Jul 01 20:18:39 2006 +0200 +++ b/MoinMoin/search/queryparser.py Sat Jul 01 22:47:14 2006 +0200 @@ -177,15 +177,15 @@ class AndExpression(BaseExpression): wanted = wanted and term.xapian_wanted() return wanted - def xapian_term(self, request): + def xapian_term(self, request, allterms): # sort negated terms terms = [] not_terms = [] for term in self._subterms: if not term.negated: - 
terms.append(term.xapian_term(request)) + terms.append(term.xapian_term(request, allterms)) else: - not_terms.append(term.xapian_term(request)) + not_terms.append(term.xapian_term(request, allterms)) # prepare query for not negated terms if len(terms) == 1: @@ -226,9 +226,9 @@ class OrExpression(AndExpression): matches.extend(result) return matches - def xapian_term(self, request): + def xapian_term(self, request, allterms): # XXX: negated terms managed by _moinSearch? - return Query(Query.OP_OR, [term.xapian_term(request) for term in self._subterms]) + return Query(Query.OP_OR, [term.xapian_term(request, allterms) for term in self._subterms]) class TextSearch(BaseExpression): @@ -303,9 +303,14 @@ class TextSearch(BaseExpression): def xapian_wanted(self): return not self.use_re - def xapian_term(self, request): + def xapian_term(self, request, allterms): if self.use_re: - return None # xapian can't do regex search + # basic regex matching per term + terms = [term for term in allterms() if + self.search_re.match(term)] + if not terms: + return None + queries = [Query(Query.OP_OR, terms)] else: analyzer = Xapian.WikiAnalyzer(request=request, language=request.cfg.language_default) @@ -331,10 +336,10 @@ class TextSearch(BaseExpression): self._build_re(' '.join(stemmed), use_re=False, case=self.case, stemmed=True) - # titlesearch OR parsed wikiwords - return Query(Query.OP_OR, - (self.titlesearch.xapian_term(request), - Query(Query.OP_AND, queries))) + # titlesearch OR parsed wikiwords + return Query(Query.OP_OR, + (self.titlesearch.xapian_term(request, allterms), + Query(Query.OP_AND, queries))) class TitleSearch(BaseExpression): @@ -406,9 +411,14 @@ class TitleSearch(BaseExpression): def xapian_wanted(self): return not self.use_re - def xapian_term(self, request): + def xapian_term(self, request, allterms): if self.use_re: - return None # xapian doesn't support regex search + # basic regex matching per term + terms = [term for term in allterms() if + 
self.search_re.match(term)] + if not terms: + return None + queries = [Query(Query.OP_OR, terms)] else: analyzer = Xapian.WikiAnalyzer(request=request, language=request.cfg.language_default) @@ -438,7 +448,7 @@ class TitleSearch(BaseExpression): self._build_re(' '.join(stemmed), use_re=False, case=self.case, stemmed=True) - return Query(Query.OP_AND, queries) + return Query(Query.OP_AND, queries) class LinkSearch(BaseExpression): @@ -464,12 +474,10 @@ class LinkSearch(BaseExpression): def _build_re(self, pattern, use_re=False, case=False): """ Make a regular expression out of a text pattern """ flags = case and re.U or (re.I | re.U) - try: - if not use_re: - raise re.error + if use_re: self.search_re = re.compile(pattern, flags) self.static = False - except re.error: + else: self.pattern = pattern self.static = True @@ -516,13 +524,26 @@ class LinkSearch(BaseExpression): def xapian_wanted(self): return not self.use_re - def xapian_term(self, request): - pattern = self.pattern + def xapian_term(self, request, allterms): + prefix = Xapian.Index.prefixMap['linkto'] if self.use_re: - return None # xapian doesnt support regex search - else: - return UnicodeQuery('%s:%s' % - (Xapian.Index.prefixMap['linkto'], pattern)) + # basic regex matching per term + terms = [] + found = None + n = len(prefix) + for term in allterms(): + if prefix == term[:n]: + found = True + if self.search_re.match(term[n+1:]): + terms.append(term) + elif found: + continue + + if not terms: + return None + return Query(Query.OP_OR, terms) + else: + return UnicodeQuery('%s:%s' % (prefix, self.pattern)) class LanguageSearch(BaseExpression): @@ -563,14 +584,28 @@ class LanguageSearch(BaseExpression): def xapian_wanted(self): return not self.use_re - def xapian_term(self, request): - pattern = self.pattern + def xapian_term(self, request, allterms): + self.xapian_called = True + prefix = Xapian.Index.prefixMap['lang'] if self.use_re: - return None # xapian doesnt support regex search - else: - 
self.xapian_called = True - return UnicodeQuery('%s%s' % - (Xapian.Index.prefixMap['lang'], pattern)) + # basic regex matching per term + terms = [] + found = None + n = len(prefix) + for term in allterms(): + if prefix == term[:n]: + found = True + if self.search_re.match(term[n:]): + terms.append(term) + elif found: + continue + + if not terms: + return None + return Query(Query.OP_OR, terms) + else: + pattern = self.pattern + return UnicodeQuery('%s%s' % (prefix, pattern)) ############################################################################## diff -r f16cf67d3440 -r 22f6f589162a docs/CHANGES.fpletz --- a/docs/CHANGES.fpletz Sat Jul 01 20:18:39 2006 +0200 +++ b/docs/CHANGES.fpletz Sat Jul 01 22:47:14 2006 +0200 @@ -2,11 +2,12 @@ Branch moin/1.6-xapian-fpletz ============================= Known main issues: - * Regex searching with Xapian? + * Only term-based regex searching possible, modifier or heuristic to + enable usage of _moinSearch for full compatibility? * HACK: MoinMoin.Xapian.Index._get_languages (wait for proper metadata) ToDo: - * Mockup the new search UI + * Implement the new search UI * Write/update documentation for all the new search stuff * Indexing and searching of categories (new term prefix) * Drop _moinSearch when using Xapian and use term positions provided @@ -100,3 +101,9 @@ 2006-06-28 derived from this, cleanups in calling structure and function prototypes to make it more extensible +2006-06-29 + * Tested some ideas with regexp searching + +2006-07-01 + * Fully implemented term-based regexp searching + # HG changeset patch # User Franz Pletz # Date 1152094762 -7200 # Node ID 4508fc92fcb1e121ae33d56f35035d4794986204 # Parent 22f6f589162a592c351a09a88e6184e111871017 index exact positions of terms (postings) diff -r 22f6f589162a -r 4508fc92fcb1 MoinMoin/search/Xapian.py --- a/MoinMoin/search/Xapian.py Sat Jul 01 22:47:14 2006 +0200 +++ b/MoinMoin/search/Xapian.py Wed Jul 05 12:19:22 2006 +0200 @@ -92,38 +92,47 @@ class WikiAnalyzer: 
tokenstream = re.finditer(self.token_re, value) for m in tokenstream: if m.group("acronym"): - yield enc(m.group("acronym").replace('.', '')) + yield (enc(m.group("acronym").replace('.', '')), + m.start()) elif m.group("company"): - yield enc(m.group("company")) + yield (enc(m.group("company")), m.start()) elif m.group("email"): + displ = 0 for word in self.mail_re.split(m.group("email")): if word: - yield enc(word) + yield (enc(word), m.start() + displ) + displ += len(word) + 1 elif m.group("hostname"): + displ = 0 for word in self.dot_re.split(m.group("hostname")): - yield enc(word) + yield (enc(word), m.start() + displ) + displ += len(word) + 1 elif m.group("num"): + displ = 0 for word in self.dot_re.split(m.group("num")): - yield enc(word) + yield (enc(word), m.start() + displ) + displ += len(word) + 1 elif m.group("word"): word = m.group("word") - yield enc(word) + yield (enc(word), m.start()) # if it is a CamelCaseWord, we additionally yield Camel, Case and Word if self.wikiword_re.match(word): for sm in re.finditer(self.singleword_re, word): - yield enc(sm.group()) + yield (enc(sm.group()), m.start() + sm.start()) def tokenize(self, value, flat_stemming=True): """Yield a stream of lower cased raw and stemmed (optional) words from a string. value must be an UNICODE object or a list of unicode objects """ - for i in self.raw_tokenize(value): + for word, pos in self.raw_tokenize(value): if flat_stemming: - yield i # XXX: should we really use a prefix for that? Index.prefixMap['raw'] + i + # XXX: should we really use a prefix for that? 
+ # Index.prefixMap['raw'] + i + yield (word, pos) if self.stemmer: - yield self.stemmer.stemWord(i) + yield (self.stemmer.stemWord(word), pos) else: - yield (i, self.stemmer.stemWord(i)) + yield (i, self.stemmer.stemWord(i), pos) ############################################################################# diff -r 22f6f589162a -r 4508fc92fcb1 MoinMoin/search/queryparser.py --- a/MoinMoin/search/queryparser.py Sat Jul 01 22:47:14 2006 +0200 +++ b/MoinMoin/search/queryparser.py Wed Jul 05 12:19:22 2006 +0200 @@ -323,13 +323,13 @@ class TextSearch(BaseExpression): if request.cfg.xapian_stemming: # stemmed OR not stemmed tmp = [] - for i in analyzer.tokenize(t, flat_stemming=False): - tmp.append(UnicodeQuery(Query.OP_OR, i)) - stemmed.append(i[1]) + for w, s, pos in analyzer.tokenize(t, flat_stemming=False): + tmp.append(UnicodeQuery(Query.OP_OR, (w, s))) + stemmed.append(w) t = tmp else: # just not stemmed - t = [UnicodeQuery(i) for i in analyzer.tokenize(t)] + t = [UnicodeQuery(w) for w, pos in analyzer.tokenize(t)] queries.append(Query(Query.OP_AND, t)) if stemmed: @@ -423,7 +423,7 @@ class TitleSearch(BaseExpression): analyzer = Xapian.WikiAnalyzer(request=request, language=request.cfg.language_default) terms = self._pattern.split() - terms = [list(analyzer.raw_tokenize(t)) for t in terms] + terms = [[w for w, pos in analyzer.raw_tokenize(t)] for t in terms] # all parsed wikiwords, AND'ed queries = [] @@ -432,15 +432,16 @@ class TitleSearch(BaseExpression): if request.cfg.xapian_stemming: # stemmed OR not stemmed tmp = [] - for i in analyzer.tokenize(t, flat_stemming=False): - tmp.append(UnicodeQuery(Query.OP_OR, ['%s%s' % - (Xapian.Index.prefixMap['title'], j) for j in i])) - stemmed.append(i[1]) + for w, s, pos in analyzer.tokenize(t, flat_stemming=False): + tmp.append(UnicodeQuery(Query.OP_OR, + ['%s%s' % (Xapian.Index.prefixMap['title'], j) + for j in (w, s)])) + stemmed.append(w) t = tmp else: # just not stemmed - t = [UnicodeQuery('%s%s' % 
(Xapian.Index.prefixMap['title'], i)) - for i in analyzer.tokenize(t)] + t = [UnicodeQuery('%s%s' % (Xapian.Index.prefixMap['title'], w)) + for w, pos in analyzer.tokenize(t)] queries.append(Query(Query.OP_AND, t)) diff -r 22f6f589162a -r 4508fc92fcb1 MoinMoin/support/xapwrap/document.py --- a/MoinMoin/support/xapwrap/document.py Sat Jul 01 22:47:14 2006 +0200 +++ b/MoinMoin/support/xapwrap/document.py Wed Jul 05 12:19:22 2006 +0200 @@ -140,12 +140,16 @@ class Document: def toXapianDocument(self, indexValueMap, prefixMap=None): d = xapian.Document() - position = 1 + position = 0 analyzer = self.analyzerFactory() # add text fields for field in self.textFields: for token in analyzer.tokenize(field.text): + if isinstance(token, tuple): + token, position = token + else: + position += 1 # the xapian swig bindings don't like unicode objects, so we # decode terms to UTF-8 before indexing. this is fine as # long as all data that goes into the db (whether for @@ -159,12 +163,13 @@ class Document: # the process, the string length could expand, so we # need to check here as well. 
d.add_posting(checkKeyLen(token), position) - position += 1 position += INTER_FIELD_POSITION_GAP if field.prefix: prefix = field.name for token in analyzer.tokenize(field.text): + if isinstance(token, tuple): + token = token[0] # token is unicode, but gets converted to UTF-8 # by makePairForWrite: term = makePairForWrite(prefix, token, prefixMap) # HG changeset patch # User Franz Pletz # Date 1152268134 -7200 # Node ID 134b5ee9904620032da2f1f8681b5978f7c57298 # Parent 4508fc92fcb1e121ae33d56f35035d4794986204 basic fetching of matches for terms with xapian diff -r 4508fc92fcb1 -r 134b5ee99046 MoinMoin/search/Xapian.py --- a/MoinMoin/search/Xapian.py Wed Jul 05 12:19:22 2006 +0200 +++ b/MoinMoin/search/Xapian.py Fri Jul 07 12:28:54 2006 +0200 @@ -87,7 +87,7 @@ class WikiAnalyzer: if isinstance(value, list): # used for page links for v in value: - yield enc(v) + yield (enc(v), 0) else: tokenstream = re.finditer(self.token_re, value) for m in tokenstream: @@ -132,7 +132,7 @@ class WikiAnalyzer: if self.stemmer: yield (self.stemmer.stemWord(word), pos) else: - yield (i, self.stemmer.stemWord(i), pos) + yield (word, self.stemmer.stemWord(word), pos) ############################################################################# @@ -224,6 +224,13 @@ class Index(BaseIndex): while i != db.allterms_end(): yield i.get_term() i.next() + + def termpositions(self, uid, term): + db = xapidx.ExceptionTranslater.openIndex(True, self.dir) + pos = db.positionlist_begin(uid, term) + while pos != db.positionlist_end(uid, term): + yield pos.get_termpos() + pos.next() def _index_file(self, request, writer, filename, mode='update'): """ index a file as it were a page named pagename diff -r 4508fc92fcb1 -r 134b5ee99046 MoinMoin/search/builtin.py --- a/MoinMoin/search/builtin.py Wed Jul 05 12:19:22 2006 +0200 +++ b/MoinMoin/search/builtin.py Fri Jul 07 12:28:54 2006 +0200 @@ -15,6 +15,7 @@ from MoinMoin.Page import Page from MoinMoin.Page import Page from MoinMoin.util import filesys, lock 
from MoinMoin.search.results import getSearchResults +from MoinMoin.search.queryparser import TextMatch, TitleMatch ############################################################################## # Search Engine Abstraction @@ -384,30 +385,47 @@ class Search: index = Index(self.request) except ImportError: index = None + if index and index.exists(): #and self.query.xapian_wanted(): self.request.clock.start('_xapianSearch') try: from MoinMoin.support import xapwrap - query = self.query.xapian_term(self.request, - index.allterms) + query = self.query.xapian_term(self.request, index.allterms) self.request.log("xapianSearch: query = %r" % query.get_description()) query = xapwrap.index.QObjQuery(query) - hits = index.search(query) + enq, hits = index.search(query) self.request.log("xapianSearch: finds: %r" % hits) def dict_decode(d): """ decode dict values to unicode """ for k, v in d.items(): d[k] = d[k].decode(config.charset) return d - pages = [dict_decode(hit['values']) for hit in hits] + pages = [{'uid': hit['uid'], 'values': dict_decode(hit['values'])} + for hit in hits] self.request.log("xapianSearch: finds pages: %r" % pages) + self._xapianEnquire = enq + self._xapianIndex = index except BaseIndex.LockedException: pass #except AttributeError: # pages = [] self.request.clock.stop('_xapianSearch') - return self._moinSearch(pages) + return self._getHits(hits, self._xapianMatch) + else: + return self._moinSearch(pages) + + def _xapianMatch(self, page, uid): + matches = [] + term = self._xapianEnquire.get_matching_terms_begin(uid) + #print hit['uid'] + while term != self._xapianEnquire.get_matching_terms_end(uid): + print term.get_term(), ':', list(self._xapianIndex.termpositions(uid, term.get_term())) + for pos in self._xapianIndex.termpositions(uid, term.get_term()): + matches.append(TextMatch(start=pos, + end=pos+len(term.get_term()))) + term.next() + return matches def _moinSearch(self, pages=None): """ Search pages using moin's built-in full text search @@ -421,9 
+439,23 @@ class Search: # if we are not called from _xapianSearch, we make a full pagelist, # but don't search attachments (thus attachment name = '') pages = [{'pagename': p, 'attachment': '', 'wikiname': 'Self', } for p in self._getPageList()] + hits = self._getHits(pages, self._moinMatch) + self.request.clock.stop('_moinSearch') + return hits + + def _moinMatch(self, page, uid): + return self.query.search(page) + + def _getHits(self, pages, matchSearchFunction): hits = [] fs_rootpage = self.fs_rootpage - for valuedict in pages: + for hit in pages: + if 'values' in hit: + valuedict = hit['values'] + uid = hit['uid'] + else: + valuedict = hit + wikiname = valuedict['wikiname'] pagename = valuedict['pagename'] attachment = valuedict['attachment'] @@ -436,12 +468,11 @@ class Search: else: hits.append((wikiname, page, attachment, None)) else: - match = self.query.search(page) + match = matchSearchFunction(page, uid) if match: hits.append((wikiname, page, attachment, match)) else: # other wiki hits.append((wikiname, pagename, attachment, None)) - self.request.clock.stop('_moinSearch') return hits def _getPageList(self): diff -r 4508fc92fcb1 -r 134b5ee99046 MoinMoin/search/queryparser.py --- a/MoinMoin/search/queryparser.py Wed Jul 05 12:19:22 2006 +0200 +++ b/MoinMoin/search/queryparser.py Fri Jul 07 12:28:54 2006 +0200 @@ -340,7 +340,6 @@ class TextSearch(BaseExpression): return Query(Query.OP_OR, (self.titlesearch.xapian_term(request, allterms), Query(Query.OP_AND, queries))) - class TitleSearch(BaseExpression): """ Term searches in pattern in page title only """ diff -r 4508fc92fcb1 -r 134b5ee99046 MoinMoin/support/xapwrap/index.py --- a/MoinMoin/support/xapwrap/index.py Wed Jul 05 12:19:22 2006 +0200 +++ b/MoinMoin/support/xapwrap/index.py Fri Jul 07 12:28:54 2006 +0200 @@ -635,7 +635,7 @@ class ReadOnlyIndex: valRes[valName] = xapDoc.get_value(valueIndex) thisResult['values'] = valRes results.append(thisResult) - return results + return enq, results except: del 
enq, mset raise diff -r 4508fc92fcb1 -r 134b5ee99046 docs/CHANGES.fpletz --- a/docs/CHANGES.fpletz Wed Jul 05 12:19:22 2006 +0200 +++ b/docs/CHANGES.fpletz Fri Jul 07 12:28:54 2006 +0200 @@ -107,3 +107,18 @@ 2006-07-01 2006-07-01 * Fully implemented term-based regexp searching +2006-07-04 + * Evaluating the current framework for the new UI (no new sane code to + commit) + +2006-07-05 + * Indexing correct positions in xapwrap + +2006-07-06 + * Played with Xapian to get correct positions and where to integrate + in MoinMoin + +2006-07-07 + * Basic (quick and dirty, limitations and bugs included, but + commit-ready) implementation of getting matches out of the Xapian DB + # HG changeset patch # User Franz Pletz # Date 1152875835 -7200 # Node ID 248789a3f15571ba8011116b1b839edf15d5f4c6 # Parent 28ae528ca238f4115b4639a70dacca49124b8950 improving positions fetched from xapian, TitleMatch support, bugfixes for the current code diff -r 28ae528ca238 -r 248789a3f155 MoinMoin/search/builtin.py --- a/MoinMoin/search/builtin.py Fri Jul 07 12:30:34 2006 +0200 +++ b/MoinMoin/search/builtin.py Fri Jul 14 13:17:15 2006 +0200 @@ -415,17 +415,25 @@ class Search: else: return self._moinSearch(pages) + def _xapianMatchDecider(self, term, pos): + if term[0] == 'S': # TitleMatch + return TitleMatch(start=pos, end=pos+len(term)-1) + else: # TextMatch (incl. 
headers) + return TextMatch(start=pos, end=pos+len(term)) + def _xapianMatch(self, page, uid): - matches = [] + """ Get all relevant Xapian matches per document id """ + positions = {} term = self._xapianEnquire.get_matching_terms_begin(uid) - #print hit['uid'] while term != self._xapianEnquire.get_matching_terms_end(uid): - print term.get_term(), ':', list(self._xapianIndex.termpositions(uid, term.get_term())) - for pos in self._xapianIndex.termpositions(uid, term.get_term()): - matches.append(TextMatch(start=pos, - end=pos+len(term.get_term()))) + term_name = term.get_term() + for pos in self._xapianIndex.termpositions(uid,term.get_term()): + if pos not in positions or \ + len(positions[pos]) < len(term_name): + positions[pos] = term_name term.next() - return matches + return [self._xapianMatchDecider(term, pos) for pos, term + in positions.iteritems()] def _moinSearch(self, pages=None): """ Search pages using moin's built-in full text search @@ -444,9 +452,11 @@ class Search: return hits def _moinMatch(self, page, uid): + """ Just kick off regular moinSearch """ return self.query.search(page) def _getHits(self, pages, matchSearchFunction): + """ Get the hit tuples in pages through matchSearchFunction """ hits = [] fs_rootpage = self.fs_rootpage for hit in pages: @@ -455,6 +465,7 @@ class Search: uid = hit['uid'] else: valuedict = hit + uid = None wikiname = valuedict['wikiname'] pagename = valuedict['pagename'] @@ -468,9 +479,9 @@ class Search: else: hits.append((wikiname, page, attachment, None)) else: - match = matchSearchFunction(page, uid) - if match: - hits.append((wikiname, page, attachment, match)) + matches = matchSearchFunction(page, uid) + if matches: + hits.append((wikiname, page, attachment, matches)) else: # other wiki hits.append((wikiname, pagename, attachment, None)) return hits diff -r 28ae528ca238 -r 248789a3f155 MoinMoin/search/results.py --- a/MoinMoin/search/results.py Fri Jul 07 12:30:34 2006 +0200 +++ b/MoinMoin/search/results.py Fri Jul 
14 13:17:15 2006 +0200 @@ -494,7 +494,8 @@ class SearchResults: start = len(header) # Find first match after start for i in xrange(len(matches)): - if matches[i].start >= start: + if matches[i].start >= start and \ + isinstance(matches[i], TextMatch): return i, start return 0, 0 diff -r 28ae528ca238 -r 248789a3f155 MoinMoin/support/xapwrap/document.py --- a/MoinMoin/support/xapwrap/document.py Fri Jul 07 12:30:34 2006 +0200 +++ b/MoinMoin/support/xapwrap/document.py Fri Jul 14 13:17:15 2006 +0200 @@ -145,6 +145,9 @@ class Document: # add text fields for field in self.textFields: + # XXX: terms textFields won't get numbered + # after each other, needed for titles + position = 0 for token in analyzer.tokenize(field.text): if isinstance(token, tuple): token, position = token @@ -163,19 +166,20 @@ class Document: # the process, the string length could expand, so we # need to check here as well. d.add_posting(checkKeyLen(token), position) - position += INTER_FIELD_POSITION_GAP + #position += INTER_FIELD_POSITION_GAP if field.prefix: prefix = field.name for token in analyzer.tokenize(field.text): if isinstance(token, tuple): - token = token[0] + token, position = token + else: + position += 1 # token is unicode, but gets converted to UTF-8 # by makePairForWrite: term = makePairForWrite(prefix, token, prefixMap) d.add_posting(term, position) - position += 1 - position += INTER_FIELD_POSITION_GAP + #position += INTER_FIELD_POSITION_GAP # add keyword fields for field in self.keywords: diff -r 28ae528ca238 -r 248789a3f155 docs/CHANGES.fpletz --- a/docs/CHANGES.fpletz Fri Jul 07 12:30:34 2006 +0200 +++ b/docs/CHANGES.fpletz Fri Jul 14 13:17:15 2006 +0200 @@ -5,14 +5,15 @@ Branch moin/1.6-xapian-fpletz * Only term-based regex searching possible, modifier or heuristic to enable usage of _moinSearch for full compatibility? * HACK: MoinMoin.Xapian.Index._get_languages (wait for proper metadata) + * Positions saved in Xapian aren't always correct, check. 
Code + generally needs some more love. ToDo: * Implement the new search UI * Write/update documentation for all the new search stuff * Indexing and searching of categories (new term prefix) - * Drop _moinSearch when using Xapian and use term positions provided - by Xapian itself, needs some reworking of WikiAnalyzer/xapwrap to - get the position of stemmed words right + * Reevaluate Xapwrap, possibly drop it and rip out usable stuff + (i.e. ExceptionTranslator) New Features: * Faster search thanks to Xapian @@ -122,3 +123,24 @@ 2006-07-07 * Basic (quick and dirty, limitations and bugs included, but commit-ready) implementation of getting matches out of the Xapian DB +2006-07-08 + * No work: daytrip to Munich + +2006-07-09 + * Bugfix for _moinSearch (not using Xapian) + +2006-07-11 + * Make matches which we get from Xapian more reliable + * Add TitleMatch support + * Xapwrap needed some tuning (aka hacking), think about dropping + and/or rewriting much of its code as it doesn't always fit (and + probably won't in the future) + +2006-07-12 +2006-07-13 + * No work + +2006-07-14 + * Minor bugfix for TitleMatch, now works correctly + * First interesting match must be a TextMatch + # HG changeset patch # User Franz Pletz # Date 1152882754 -7200 # Node ID 72aeb2ba133d7b5524873cde24382722f762f067 # Parent 248789a3f15571ba8011116b1b839edf15d5f4c6 support complete rebuild of the index diff -r 248789a3f155 -r 72aeb2ba133d MoinMoin/script/index/build.py --- a/MoinMoin/script/index/build.py Fri Jul 14 13:17:15 2006 +0200 +++ b/MoinMoin/script/index/build.py Fri Jul 14 15:12:34 2006 +0200 @@ -22,7 +22,7 @@ class IndexScript(MoinScript): ) self.parser.add_option( "--mode", metavar="MODE", dest="mode", - help="either add (unconditionally add to index) or update (update an existing index)" + help="either add (unconditionally add to index), update (update an existing index) or rebuild (remove and add)" ) def mainloop(self): @@ -40,5 +40,4 @@ class PluginScript(IndexScript): def 
command(self): from MoinMoin.search.Xapian import Index Index(self.request).indexPages(self.files, self.options.mode) - #Index(self.request).test(self.request) diff -r 248789a3f155 -r 72aeb2ba133d MoinMoin/search/Xapian.py --- a/MoinMoin/search/Xapian.py Fri Jul 14 13:17:15 2006 +0200 +++ b/MoinMoin/search/Xapian.py Fri Jul 14 15:12:34 2006 +0200 @@ -8,7 +8,7 @@ """ debug = True -import sys, os, re, codecs, time +import sys, os, re, codecs, time, os from pprint import pprint import xapian @@ -237,6 +237,13 @@ class Index(BaseIndex): Assumes that the write lock is acquired """ fs_rootpage = 'FS' # XXX FS hardcoded + + # rebuilding the DB: delete it and add everything + if mode == 'rebuild': + for f in os.listdir(self.dir): + os.unlink(f) + mode = 'add' + try: wikiname = request.cfg.interwikiname or 'Self' itemid = "%s:%s" % (wikiname, os.path.join(fs_rootpage, filename)) diff -r 248789a3f155 -r 72aeb2ba133d MoinMoin/search/builtin.py --- a/MoinMoin/search/builtin.py Fri Jul 14 13:17:15 2006 +0200 +++ b/MoinMoin/search/builtin.py Fri Jul 14 15:12:34 2006 +0200 @@ -149,7 +149,7 @@ class BaseIndex: lock_dir = os.path.join(main_dir, 'index-lock') self.lock = lock.WriteLock(lock_dir, timeout=3600.0, readlocktimeout=60.0) - self.read_lock = lock.ReadLock(lock_dir, timeout=3600.0) + #self.read_lock = lock.ReadLock(lock_dir, timeout=3600.0) self.queue = UpdateQueue(os.path.join(main_dir, 'update-queue'), os.path.join(main_dir, 'update-queue-lock')) @@ -172,12 +172,12 @@ class BaseIndex: raise NotImplemented def search(self, query): - if not self.read_lock.acquire(1.0): - raise self.LockedException - try: - hits = self._search(query) - finally: - self.read_lock.release() + #if not self.read_lock.acquire(1.0): + # raise self.LockedException + #try: + hits = self._search(query) + #finally: + # self.read_lock.release() return hits def update_page(self, page): diff -r 248789a3f155 -r 72aeb2ba133d docs/CHANGES.fpletz --- a/docs/CHANGES.fpletz Fri Jul 14 13:17:15 2006 +0200 +++ 
b/docs/CHANGES.fpletz Fri Jul 14 15:12:34 2006 +0200 @@ -14,6 +14,9 @@ Branch moin/1.6-xapian-fpletz * Indexing and searching of categories (new term prefix) * Reevaluate Xapwrap, possibly drop it and rip out usable stuff (i.e. ExceptionTranslator) + * Add stemming support for highlighting stuff: + 1. regexp for whole word (all lowercase), or + 2. just the root of the word New Features: * Faster search thanks to Xapian @@ -143,4 +146,6 @@ 2006-07-14 2006-07-14 * Minor bugfix for TitleMatch, now works correctly * First interesting match must be a TextMatch + * Comment read_lock code from BaseIndex (should not be needed) + * Support complete rebuild of the database (delete and add) # HG changeset patch # User Franz Pletz # Date 1153131898 -7200 # Node ID f472ddeba121abdb683c15793b3420fc0c57aa0c # Parent b8c1bb917748673ecdc2571d3fa74ab27a8b577c SystemInfo macro extended with the state of the index, ensure fallback to moinSearch diff -r b8c1bb917748 -r f472ddeba121 MoinMoin/macro/SystemInfo.py --- a/MoinMoin/macro/SystemInfo.py Mon Jul 17 12:17:39 2006 +0200 +++ b/MoinMoin/macro/SystemInfo.py Mon Jul 17 12:24:58 2006 +0200 @@ -112,7 +112,9 @@ def execute(Macro, args): ', '.join(wikiutil.wikiPlugins('parser', Macro.cfg)) or nonestr) state = (_('Disabled'), _('Enabled')) - row(_('Xapian search'), state[request.cfg.xapian_search]) + from MoinMoin.search.builtin import Search + row(_('Xapian search'), '%s, %sactive' % (state[request.cfg.xapian_search], + not Search._xapianIndex(request) and 'not ' or '')) row(_('Active threads'), t_count or 'N/A') buf.write(u'') diff -r b8c1bb917748 -r f472ddeba121 MoinMoin/search/Xapian.py --- a/MoinMoin/search/Xapian.py Mon Jul 17 12:17:39 2006 +0200 +++ b/MoinMoin/search/Xapian.py Mon Jul 17 12:24:58 2006 +0200 @@ -187,6 +187,10 @@ class Index(BaseIndex): self.request.cfg.siteid) else: return os.path.join(self.request.cfg.cache_dir, 'xapian') + + def exists(self): + """ Check if the Xapian index exists """ + return 
BaseIndex.exists(self) and os.listdir(self.dir) def _search(self, query): """ read lock must be acquired """ diff -r b8c1bb917748 -r f472ddeba121 MoinMoin/search/builtin.py --- a/MoinMoin/search/builtin.py Mon Jul 17 12:17:39 2006 +0200 +++ b/MoinMoin/search/builtin.py Mon Jul 17 12:24:58 2006 +0200 @@ -373,6 +373,18 @@ class Search: # ---------------------------------------------------------------- # Private! + def _xapianIndex(request): + try: + from MoinMoin.search.Xapian import Index + index = Index(request) + except ImportError: + index = None + + if index and index.exists(): + return index + + _xapianIndex = staticmethod(_xapianIndex) + def _xapianSearch(self): """ Search using Xapian @@ -380,13 +392,8 @@ class Search: return moin search in those pages. """ pages = None - try: - from MoinMoin.search.Xapian import Index - index = Index(self.request) - except ImportError: - index = None - - if index and index.exists(): #and self.query.xapian_wanted(): + index = self._xapianIndex(self.request) + if index: #and self.query.xapian_wanted(): self.request.clock.start('_xapianSearch') try: from MoinMoin.support import xapwrap diff -r b8c1bb917748 -r f472ddeba121 docs/CHANGES.fpletz --- a/docs/CHANGES.fpletz Mon Jul 17 12:17:39 2006 +0200 +++ b/docs/CHANGES.fpletz Mon Jul 17 12:24:58 2006 +0200 @@ -149,3 +149,7 @@ 2006-07-14 * Comment read_lock code from BaseIndex (should not be needed) * Support complete rebuild of the database (delete and add) +2006-07-17 + * SystemInfo macro now also shows if xapian is being used (index + available) and more graceful fallback to moinSearch + # HG changeset patch # User Franz Pletz # Date 1153134722 -7200 # Node ID dbb3bf01ae19d95b598971d6ecece2042c2388da # Parent f472ddeba121abdb683c15793b3420fc0c57aa0c the index rebuild code was in the wrong spot diff -r f472ddeba121 -r dbb3bf01ae19 MoinMoin/search/Xapian.py --- a/MoinMoin/search/Xapian.py Mon Jul 17 12:24:58 2006 +0200 +++ b/MoinMoin/search/Xapian.py Mon Jul 17 13:12:02 2006 +0200 
@@ -241,12 +241,6 @@ class Index(BaseIndex): Assumes that the write lock is acquired """ fs_rootpage = 'FS' # XXX FS hardcoded - - # rebuilding the DB: delete it and add everything - if mode == 'rebuild': - for f in os.listdir(self.dir): - os.unlink(f) - mode = 'add' try: wikiname = request.cfg.interwikiname or 'Self' @@ -444,6 +438,13 @@ class Index(BaseIndex): When called in a new thread, lock is acquired before the call, and this method must release it when it finishes or fails. """ + + # rebuilding the DB: delete it and add everything + if mode == 'rebuild': + for f in os.listdir(self.dir): + os.unlink(f) + mode = 'add' + try: writer = xapidx.Index(self.dir, True) writer.configure(self.prefixMap, self.indexValueMap) # HG changeset patch # User Franz Pletz # Date 1153135360 -7200 # Node ID 541271bb8a5620d0bc1b85ead95367c6d2ca8a82 # Parent dbb3bf01ae19d95b598971d6ecece2042c2388da fix for rebuilding the index again, get the full path for each file diff -r dbb3bf01ae19 -r 541271bb8a56 MoinMoin/search/Xapian.py --- a/MoinMoin/search/Xapian.py Mon Jul 17 13:12:02 2006 +0200 +++ b/MoinMoin/search/Xapian.py Mon Jul 17 13:22:40 2006 +0200 @@ -442,7 +442,7 @@ class Index(BaseIndex): # rebuilding the DB: delete it and add everything if mode == 'rebuild': for f in os.listdir(self.dir): - os.unlink(f) + os.unlink(os.path.join(self.dir, f)) mode = 'add' try: # HG changeset patch # User Franz Pletz # Date 1153229762 -7200 # Node ID 3172214eac968cf7da8c4213255fe86d05f0bc18 # Parent 541271bb8a5620d0bc1b85ead95367c6d2ca8a82 Fixed some bugs, whitespaces at EOL, better i18n for SystemInfo diff -r 541271bb8a56 -r 3172214eac96 MoinMoin/formatter/text_html.py --- a/MoinMoin/formatter/text_html.py Mon Jul 17 13:22:40 2006 +0200 +++ b/MoinMoin/formatter/text_html.py Tue Jul 18 15:36:02 2006 +0200 @@ -388,7 +388,7 @@ class Formatter(FormatterBase): if newline: result.append(self._newline()) result.append('' % (tag)) - tagstr = ''.join(result) + tagstr = ''.join(result) else: # Inline 
elements # Pull from stack, ignore order, that is not our problem. @@ -455,7 +455,7 @@ class Formatter(FormatterBase): result = [] result.append(self.anchordef(aid)) result.append(self._close('div', newline=newline)) - return ''.join(result) + return ''.join(result) def lang(self, on, lang_name): """ Insert text with specific lang and direction. @@ -826,7 +826,7 @@ class Formatter(FormatterBase): """ tag = 'tt' # Maybe we don't need this, because we have tt will be in inlineStack. - self._in_code = on + self._in_code = on if on: return self._open(tag, allowed_attrs=[], **kw) return self._close(tag) @@ -1303,7 +1303,7 @@ document.write(' # Date 1153235702 -7200 # Node ID 1bdac55acc14d44f576fde5f85be8ccaa1312269 # Parent 3172214eac968cf7da8c4213255fe86d05f0bc18 paging support for fullsearch action, fullsearch macro broken diff -r 3172214eac96 -r 1bdac55acc14 MoinMoin/action/fullsearch.py --- a/MoinMoin/action/fullsearch.py Tue Jul 18 15:36:02 2006 +0200 +++ b/MoinMoin/action/fullsearch.py Tue Jul 18 17:15:02 2006 +0200 @@ -42,6 +42,7 @@ def execute(pagename, request, fieldname needle = request.form.get(fieldname, [''])[0] case = int(request.form.get('case', [0])[0]) regex = int(request.form.get('regex', [0])[0]) # no interface currently + hitsFrom = int(request.form.get('from', [0])[0]) max_context = 1 # only show first `max_context` contexts XXX still unused @@ -94,15 +95,16 @@ def execute(pagename, request, fieldname request.write(request.formatter.startContent("content")) # First search stats - request.write(results.stats(request, request.formatter)) + request.write(results.stats(request, request.formatter, hitsFrom)) # Then search results info = not titlesearch if context: output = results.pageListWithContext(request, request.formatter, info=info, - context=context) + context=context, hitsFrom=hitsFrom) else: - output = results.pageList(request, request.formatter, info=info) + output = results.pageList(request, request.formatter, info=info, + hitsFrom=hitsFrom) 
request.write(output) request.write(request.formatter.endContent()) diff -r 3172214eac96 -r 1bdac55acc14 MoinMoin/multiconfig.py --- a/MoinMoin/multiconfig.py Tue Jul 18 15:36:02 2006 +0200 +++ b/MoinMoin/multiconfig.py Tue Jul 18 17:15:02 2006 +0200 @@ -295,6 +295,7 @@ reStructuredText Quick Reference xapian_search = False # disabled until xapian is finished xapian_index_dir = None xapian_stemming = True + search_results_per_page = 10 mail_login = None # or "user pwd" if you need to use SMTP AUTH mail_sendmail = None # "/usr/sbin/sendmail -t -i" to not use SMTP, but sendmail diff -r 3172214eac96 -r 1bdac55acc14 MoinMoin/search/results.py --- a/MoinMoin/search/results.py Tue Jul 18 15:36:02 2006 +0200 +++ b/MoinMoin/search/results.py Tue Jul 18 17:15:02 2006 +0200 @@ -10,7 +10,7 @@ @license: GNU GPL, see COPYING for details """ -import StringIO, time +import StringIO, time, re from MoinMoin import config, wikiutil from MoinMoin.Page import Page @@ -266,31 +266,37 @@ class SearchResults: self.hits = [item[1] for item in tmp] self.sort = 'page_name' - def stats(self, request, formatter): + def stats(self, request, formatter, hitsFrom): """ Return search statistics, formatted with formatter @param request: current request @param formatter: formatter to use + @param hitsFrom: current position in the hits @rtype: unicode @return formatted statistics """ _ = request.getText output = [ formatter.paragraph(1), - formatter.text(_("%(hits)d results out of about %(pages)d pages.") % - {'hits': len(self.hits), 'pages': self.pages}), + formatter.text(_("Hits %(hitsFrom)d to %(hitsTo)d " + "from %(hits)d results out of about %(pages)d pages.") % + {'hits': len(self.hits), 'pages': self.pages, + 'hitsFrom': hitsFrom + 1, + 'hitsTo': hitsFrom + request.cfg.search_results_per_page}), u' (%s)' % formatter.text(_("%.2f seconds") % self.elapsed), formatter.paragraph(0), ] return ''.join(output) - def pageList(self, request, formatter, info=0, numbered=1): + def pageList(self, request, 
formatter, info=0, numbered=1, + hitsFrom=0): """ Format a list of found pages @param request: current request @param formatter: formatter to use @param info: show match info in title @param numbered: use numbered list for display + @param hitsFrom: current position in the hits @rtype: unicode @return formatted page list """ @@ -298,15 +304,17 @@ class SearchResults: f = formatter write = self.buffer.write if numbered: - list = f.number_list + list = lambda on: f.number_list(on, start=hitsFrom+1) else: list = f.bullet_list # Add pages formatted as list if self.hits: write(list(1)) - - for page in self.hits: + + # XXX: Do some xapian magic here + hitsTo = hitsFrom + request.cfg.search_results_per_page + for page in self.hits[hitsFrom:hitsTo]: if page.attachment: querydict = { 'action': 'AttachFile', @@ -330,11 +338,14 @@ class SearchResults: ] write(''.join(item)) write(list(0)) + write(self.formatPrevNextPageLinks(hitsFrom=hitsFrom, + hitsPerPage=request.cfg.search_results_per_page, + hitsNum=len(self.hits))) return self.getvalue() def pageListWithContext(self, request, formatter, info=1, context=180, - maxlines=1): + maxlines=1, hitsFrom=0): """ Format a list of found pages with context The default parameter values will create Google-like search @@ -345,8 +356,9 @@ class SearchResults: @param request: current request @param formatter: formatter to use @param info: show match info near the page link - @param context: how many characters to show around each match. - @param maxlines: how many contexts lines to show. + @param context: how many characters to show around each match. + @param maxlines: how many contexts lines to show. 
+ @param hitsFrom: current position in the hits @rtype: unicode @return formatted page list with context """ @@ -358,7 +370,9 @@ class SearchResults: if self.hits: write(f.definition_list(1)) - for page in self.hits: + # XXX: Do some xapian magic here + hitsTo = hitsFrom+request.cfg.search_results_per_page + for page in self.hits[hitsFrom:hitsTo]: matchInfo = '' if info: matchInfo = self.formatInfo(f, page) @@ -389,6 +403,9 @@ class SearchResults: ] write(''.join(item)) write(f.definition_list(0)) + write(self.formatPrevNextPageLinks(hitsFrom=hitsFrom, + hitsPerPage=request.cfg.search_results_per_page, + hitsNum=len(self.hits))) return self.getvalue() @@ -596,6 +613,39 @@ class SearchResults: return ''.join(output) return '' + def formatPrevNextPageLinks(self, hitsFrom, hitsPerPage, hitsNum): + """ Format previous and next page links in page + + @param hitsFrom: current position in the hits + @param hitsPerPage: number of hits per page + @param hitsNum: number of hits + @rtype: unicode + @return: links to previous and next pages (if exist) + """ + _ = self.request.getText + f = self.formatter + from_re = r'\&from=[\d]+' + uri = re.sub(from_re, '', self.request.request_uri) + from_uri = lambda n: '%s&from=%i' % (uri, n) + l = [] + if hitsFrom > 0: # previous page available + n = hitsFrom - hitsPerPage + if n < 0: n = 0 + l.append(''.join([ + f.url(1, href=from_uri(n)), + _('Previous Page'), + f.url(0) + ])) + if hitsFrom < hitsNum: # next page available + n = hitsFrom + hitsPerPage + if n >= hitsNum: n = hitsNum - 1 + l.append(''.join([ + f.url(1, href=from_uri(n)), + _('Next Page'), + f.url(0) + ])) + return f.text(' | ').join(l) + def querystring(self, querydict=None): """ Return query string, used in the page link """ if querydict is None: diff -r 3172214eac96 -r 1bdac55acc14 docs/CHANGES.fpletz --- a/docs/CHANGES.fpletz Tue Jul 18 15:36:02 2006 +0200 +++ b/docs/CHANGES.fpletz Tue Jul 18 17:15:02 2006 +0200 @@ -25,8 +25,10 @@ Branch moin/1.6-xapian-fpletz * New 
config options: xapian_search 0 enables xapian-powered search xapian_index_dir None directory for xapian indices - xapian_stemming True Toggles usage of stemmer, fallback + xapian_stemming True toggles usage of stemmer, fallback to False if no stemmer installed + search_results_per_page 10 determines how many hits should be + shown on a fullsearch action Bugfixes (only stuff that is buggy in moin/1.6 main branch): * ... @@ -157,4 +159,6 @@ 2006-07-17 2006-07-18 * Fixed some bugs, whitespaces at EOL, better i18n for SystemInfo + * Implemented paging support for searches, needs some style + adjustments, breaks FullSearch macro for now # HG changeset patch # User Franz Pletz # Date 1153244039 -7200 # Node ID 0e352443948b32cb2ec7e15e629870ac5001d761 # Parent f4f7b0c2a9f263c771a0a66b7b0235bef9114e54 make FullSearch macro work again (without paging) diff -r f4f7b0c2a9f2 -r 0e352443948b MoinMoin/macro/FullSearch.py --- a/MoinMoin/macro/FullSearch.py Tue Jul 18 17:15:45 2006 +0200 +++ b/MoinMoin/macro/FullSearch.py Tue Jul 18 19:33:59 2006 +0200 @@ -57,6 +57,6 @@ def execute(macro, needle): results = search.searchPages(request, needle) results.sortByPagename() - return results.pageList(request, macro.formatter) + return results.pageList(request, macro.formatter, paging=False) diff -r f4f7b0c2a9f2 -r 0e352443948b MoinMoin/macro/__init__.py --- a/MoinMoin/macro/__init__.py Tue Jul 18 17:15:45 2006 +0200 +++ b/MoinMoin/macro/__init__.py Tue Jul 18 19:33:59 2006 +0200 @@ -332,7 +332,7 @@ class Macro: results = search.searchPages(self.request, needle, titlesearch=1, case=case) results.sortByPagename() - return results.pageList(self.request, self.formatter) + return results.pageList(self.request, self.formatter, paging=False) def _macro_InterWiki(self, args): from StringIO import StringIO diff -r f4f7b0c2a9f2 -r 0e352443948b MoinMoin/search/results.py --- a/MoinMoin/search/results.py Tue Jul 18 17:15:45 2006 +0200 +++ b/MoinMoin/search/results.py Tue Jul 18 19:33:59 2006 +0200 
@@ -289,13 +289,14 @@ class SearchResults: return ''.join(output) def pageList(self, request, formatter, info=0, numbered=1, - hitsFrom=0): + paging=True, hitsFrom=0): """ Format a list of found pages @param request: current request @param formatter: formatter to use @param info: show match info in title @param numbered: use numbered list for display + @param paging: toggle paging @param hitsFrom: current position in the hits @rtype: unicode @return formatted page list @@ -313,8 +314,13 @@ class SearchResults: write(list(1)) # XXX: Do some xapian magic here - hitsTo = hitsFrom + request.cfg.search_results_per_page - for page in self.hits[hitsFrom:hitsTo]: + if paging: + hitsTo = hitsFrom + request.cfg.search_results_per_page + displayHits = self.hits[hitsFrom:hitsTo] + else: + displayHits = self.hits + + for page in displayHits: if page.attachment: querydict = { 'action': 'AttachFile', @@ -338,14 +344,15 @@ class SearchResults: ] write(''.join(item)) write(list(0)) - write(self.formatPrevNextPageLinks(hitsFrom=hitsFrom, - hitsPerPage=request.cfg.search_results_per_page, - hitsNum=len(self.hits))) + if paging: + write(self.formatPrevNextPageLinks(hitsFrom=hitsFrom, + hitsPerPage=request.cfg.search_results_per_page, + hitsNum=len(self.hits))) return self.getvalue() def pageListWithContext(self, request, formatter, info=1, context=180, - maxlines=1, hitsFrom=0): + maxlines=1, paging=True, hitsFrom=0): """ Format a list of found pages with context The default parameter values will create Google-like search @@ -358,6 +365,7 @@ class SearchResults: @param info: show match info near the page link @param context: how many characters to show around each match. @param maxlines: how many contexts lines to show. 
+ @param paging: toggle paging @param hitsFrom: current position in the hits @rtype: unicode @return formatted page list with context @@ -371,8 +379,13 @@ class SearchResults: write(f.definition_list(1)) # XXX: Do some xapian magic here - hitsTo = hitsFrom+request.cfg.search_results_per_page - for page in self.hits[hitsFrom:hitsTo]: + if paging: + hitsTo = hitsFrom+request.cfg.search_results_per_page + displayHits = self.hits[hitsFrom:hitsTo] + else: + displayHits = self.hits + + for page in displayHits: matchInfo = '' if info: matchInfo = self.formatInfo(f, page) @@ -403,9 +416,10 @@ class SearchResults: ] write(''.join(item)) write(f.definition_list(0)) - write(self.formatPrevNextPageLinks(hitsFrom=hitsFrom, - hitsPerPage=request.cfg.search_results_per_page, - hitsNum=len(self.hits))) + if paging: + write(self.formatPrevNextPageLinks(hitsFrom=hitsFrom, + hitsPerPage=request.cfg.search_results_per_page, + hitsNum=len(self.hits))) return self.getvalue() diff -r f4f7b0c2a9f2 -r 0e352443948b docs/CHANGES.fpletz --- a/docs/CHANGES.fpletz Tue Jul 18 17:15:45 2006 +0200 +++ b/docs/CHANGES.fpletz Tue Jul 18 19:33:59 2006 +0200 @@ -160,5 +160,5 @@ 2006-07-18 2006-07-18 * Fixed some bugs, whitespaces at EOL, better i18n for SystemInfo * Implemented paging support for searches, needs some style - adjustments, breaks FullSearch macro for now + adjustments # HG changeset patch # User Franz Pletz # Date 1153246186 -7200 # Node ID 1d5fd64c356e087e35bbb2aff69e863af26c4176 # Parent 0e352443948b32cb2ec7e15e629870ac5001d761 fix for showing next page link diff -r 0e352443948b -r 1d5fd64c356e MoinMoin/search/results.py --- a/MoinMoin/search/results.py Tue Jul 18 19:33:59 2006 +0200 +++ b/MoinMoin/search/results.py Tue Jul 18 20:09:46 2006 +0200 @@ -642,7 +642,7 @@ class SearchResults: uri = re.sub(from_re, '', self.request.request_uri) from_uri = lambda n: '%s&from=%i' % (uri, n) l = [] - if hitsFrom > 0: # previous page available + if hitsFrom > 0: # previous page available n = 
hitsFrom - hitsPerPage if n < 0: n = 0 l.append(''.join([ @@ -650,7 +650,7 @@ class SearchResults: _('Previous Page'), f.url(0) ])) - if hitsFrom < hitsNum: # next page available + if hitsFrom + hitsPerPage < hitsNum: # next page available n = hitsFrom + hitsPerPage if n >= hitsNum: n = hitsNum - 1 l.append(''.join([ # HG changeset patch # User Thomas Waldmann # Date 1153409202 -7200 # Node ID 7d13ed31c40c7601e3fcd9fa8fcebf9f188423bd # Parent 1d5fd64c356e087e35bbb2aff69e863af26c4176 added missing daily CHANGES entry diff -r 1d5fd64c356e -r 7d13ed31c40c docs/CHANGES.fpletz --- a/docs/CHANGES.fpletz Tue Jul 18 20:09:46 2006 +0200 +++ b/docs/CHANGES.fpletz Thu Jul 20 17:26:42 2006 +0200 @@ -162,3 +162,9 @@ 2006-07-18 * Implemented paging support for searches, needs some style adjustments +2006-07-19 + * student didn't work on the project -- ThomasWaldmann + +2006-07-20 + * ... + # HG changeset patch # User Franz Pletz # Date 1153419644 -7200 # Node ID cb0d00cef7d56a176eb90d89bb0d1f48e4d2f36e # Parent 1d5fd64c356e087e35bbb2aff69e863af26c4176 update my CHANGES diff -r 1d5fd64c356e -r cb0d00cef7d5 docs/CHANGES.fpletz --- a/docs/CHANGES.fpletz Tue Jul 18 20:09:46 2006 +0200 +++ b/docs/CHANGES.fpletz Thu Jul 20 20:20:44 2006 +0200 @@ -17,6 +17,10 @@ Branch moin/1.6-xapian-fpletz * Add stemming support for highlighting stuff: 1. regexp for whole word (all lowercase), or 2. just the root of the word + * Subpages: Add positions for complete (!) pagenames into the index + * Case-sensitive searches / Regexp on multiple terms: Graceful + fallback to and/or merge with moinSearch based on nodes xapian can + handle in the search term tree New Features: * Faster search thanks to Xapian # HG changeset patch # User Franz Pletz # Date 1153439107 -7200 # Node ID d40445ea30af1b34190f5f43ec732329e174a5a7 # Parent 7746d74fda4febff6cba7c7f2fbab6ef49d3e03e bugfixes for regexp & link search and file indexing (i.e. 
due to xapwrap modifications) diff -r 7746d74fda4f -r d40445ea30af MoinMoin/search/Xapian.py --- a/MoinMoin/search/Xapian.py Thu Jul 20 20:21:31 2006 +0200 +++ b/MoinMoin/search/Xapian.py Fri Jul 21 01:45:07 2006 +0200 @@ -249,7 +249,7 @@ class Index(BaseIndex): mtime = wikiutil.timestamp2version(mtime) if mode == 'update': query = xapidx.RawQuery(xapdoc.makePairForWrite('itemid', itemid)) - docs = writer.search(query, valuesWanted=['pagename', 'attachment', 'mtime', 'wikiname', ]) + enq, docs = writer.search(query, valuesWanted=['pagename', 'attachment', 'mtime', 'wikiname', ]) if docs: doc = docs[0] # there should be only one uid = doc['uid'] @@ -338,7 +338,7 @@ class Index(BaseIndex): # you can just call database.replace_document(uid_term, doc) # -> done in xapwrap.index.Index.index() query = xapidx.RawQuery(xapdoc.makePairForWrite('itemid', itemid)) - docs = writer.search(query, valuesWanted=['pagename', 'attachment', 'mtime', 'wikiname', ]) + enq, docs = writer.search(query, valuesWanted=['pagename', 'attachment', 'mtime', 'wikiname', ]) if docs: doc = docs[0] # there should be only one uid = doc['uid'] @@ -387,7 +387,7 @@ class Index(BaseIndex): mtime = wikiutil.timestamp2version(os.path.getmtime(filename)) if mode == 'update': query = xapidx.RawQuery(xapdoc.makePairForWrite('itemid', att_itemid)) - docs = writer.search(query, valuesWanted=['pagename', 'attachment', 'mtime', ]) + enq, docs = writer.search(query, valuesWanted=['pagename', 'attachment', 'mtime', ]) if debug: request.log("##%r %r" % (filename, docs)) if docs: doc = docs[0] # there should be only one diff -r 7746d74fda4f -r d40445ea30af MoinMoin/search/builtin.py --- a/MoinMoin/search/builtin.py Thu Jul 20 20:21:31 2006 +0200 +++ b/MoinMoin/search/builtin.py Fri Jul 21 01:45:07 2006 +0200 @@ -15,7 +15,7 @@ from MoinMoin.Page import Page from MoinMoin.Page import Page from MoinMoin.util import filesys, lock from MoinMoin.search.results import getSearchResults -from MoinMoin.search.queryparser 
import TextMatch, TitleMatch +from MoinMoin.search.queryparser import Match, TextMatch, TitleMatch ############################################################################## # Search Engine Abstraction @@ -439,8 +439,13 @@ class Search: len(positions[pos]) < len(term_name): positions[pos] = term_name term.next() - return [self._xapianMatchDecider(term, pos) for pos, term + matches = [self._xapianMatchDecider(term, pos) for pos, term in positions.iteritems()] + + if not matches: + return [Match()] # dummy for metadata, we got a match! + + return matches def _moinSearch(self, pages=None): """ Search pages using moin's built-in full text search diff -r 7746d74fda4f -r d40445ea30af MoinMoin/search/queryparser.py --- a/MoinMoin/search/queryparser.py Thu Jul 20 20:21:31 2006 +0200 +++ b/MoinMoin/search/queryparser.py Fri Jul 21 01:45:07 2006 +0200 @@ -309,7 +309,7 @@ class TextSearch(BaseExpression): terms = [term for term in allterms() if self.search_re.match(term)] if not terms: - return None + return Query() queries = [Query(Query.OP_OR, terms)] else: analyzer = Xapian.WikiAnalyzer(request=request, @@ -414,9 +414,9 @@ class TitleSearch(BaseExpression): if self.use_re: # basic regex matching per term terms = [term for term in allterms() if - self.search_re.match(term)] + self.search_re.findall(term[1:]) and term[0] == 'S'] if not terms: - return None + return Query() queries = [Query(Query.OP_OR, terms)] else: analyzer = Xapian.WikiAnalyzer(request=request, @@ -540,7 +540,7 @@ class LinkSearch(BaseExpression): continue if not terms: - return None + return Query() return Query(Query.OP_OR, terms) else: return UnicodeQuery('%s:%s' % (prefix, self.pattern)) @@ -601,7 +601,7 @@ class LanguageSearch(BaseExpression): continue if not terms: - return None + return Query() return Query(Query.OP_OR, terms) else: pattern = self.pattern diff -r 7746d74fda4f -r d40445ea30af docs/CHANGES.fpletz --- a/docs/CHANGES.fpletz Thu Jul 20 20:21:31 2006 +0200 +++ b/docs/CHANGES.fpletz 
Fri Jul 21 01:45:07 2006 +0200 @@ -4,7 +4,8 @@ Branch moin/1.6-xapian-fpletz Known main issues: * Only term-based regex searching possible, modifier or heuristic to enable usage of _moinSearch for full compatibility? - * HACK: MoinMoin.Xapian.Index._get_languages (wait for proper metadata) + * HACK: MoinMoin.search.Xapian.Index._get_languages (wait for proper + metadata) * Positions saved in Xapian aren't always correct, check. Code generally needs some more love. @@ -170,5 +171,5 @@ 2006-07-19 * student didn't work on the project -- ThomasWaldmann 2006-07-20 - * ... + * Fixed some bugs found while testing regexp and case-sensitive searches # HG changeset patch # User Franz Pletz # Date 1153439719 -7200 # Node ID 98b9469ce6ac244824c377a42e56c3f9c1152f17 # Parent d40445ea30af1b34190f5f43ec732329e174a5a7 update CHANGES diff -r d40445ea30af -r 98b9469ce6ac docs/CHANGES.fpletz --- a/docs/CHANGES.fpletz Fri Jul 21 01:45:07 2006 +0200 +++ b/docs/CHANGES.fpletz Fri Jul 21 01:55:19 2006 +0200 @@ -172,4 +172,7 @@ 2006-07-19 2006-07-20 * Fixed some bugs found while testing regexp and case-sensitive searches + * Conclusion after tinkering with the current code to allow + cooperation between moinSearch and Xapian for case-sensitive + searches (code buried): We probably need a rather big rewrite! 
# HG changeset patch # User Franz Pletz # Date 1153581881 -7200 # Node ID fc2d00e2bb6b8c8e5f08b5b87bd6f5cd24808875 # Parent 98b9469ce6ac244824c377a42e56c3f9c1152f17 case-sensitive searches work again by using moinSearch for post processing diff -r 98b9469ce6ac -r fc2d00e2bb6b MoinMoin/search/builtin.py --- a/MoinMoin/search/builtin.py Fri Jul 21 01:55:19 2006 +0200 +++ b/MoinMoin/search/builtin.py Sat Jul 22 17:24:41 2006 +0200 @@ -393,7 +393,7 @@ class Search: """ pages = None index = self._xapianIndex(self.request) - if index: #and self.query.xapian_wanted(): + if index and self.query.xapian_wanted(): self.request.clock.start('_xapianSearch') try: from MoinMoin.support import xapwrap @@ -408,8 +408,9 @@ class Search: for k, v in d.items(): d[k] = d[k].decode(config.charset) return d - pages = [{'uid': hit['uid'], 'values': dict_decode(hit['values'])} - for hit in hits] + #pages = [{'uid': hit['uid'], 'values': dict_decode(hit['values'])} + # for hit in hits] + pages = [dict_decode(hit['values']) for hit in hits] self.request.log("xapianSearch: finds pages: %r" % pages) self._xapianEnquire = enq self._xapianIndex = index @@ -418,9 +419,11 @@ class Search: #except AttributeError: # pages = [] self.request.clock.stop('_xapianSearch') - return self._getHits(hits, self._xapianMatch) - else: - return self._moinSearch(pages) + + if not self.query.xapian_need_postproc(): + return self._getHits(hits, self._xapianMatch) + + return self._moinSearch(pages) def _xapianMatchDecider(self, term, pos): if term[0] == 'S': # TitleMatch diff -r 98b9469ce6ac -r fc2d00e2bb6b MoinMoin/search/queryparser.py --- a/MoinMoin/search/queryparser.py Fri Jul 21 01:55:19 2006 +0200 +++ b/MoinMoin/search/queryparser.py Sat Jul 22 17:24:41 2006 +0200 @@ -177,6 +177,12 @@ class AndExpression(BaseExpression): wanted = wanted and term.xapian_wanted() return wanted + def xapian_need_postproc(self): + for term in self._subterms: + if term.xapian_need_postproc(): + return True + return False + def 
xapian_term(self, request, allterms): # sort negated terms terms = [] @@ -301,7 +307,11 @@ class TextSearch(BaseExpression): return [] def xapian_wanted(self): + # XXX: Add option for term-based matching return not self.use_re + + def xapian_need_postproc(self): + return self.case def xapian_term(self, request, allterms): if self.use_re: @@ -332,7 +342,7 @@ class TextSearch(BaseExpression): t = [UnicodeQuery(w) for w, pos in analyzer.tokenize(t)] queries.append(Query(Query.OP_AND, t)) - if stemmed: + if not self.case and stemmed: self._build_re(' '.join(stemmed), use_re=False, case=self.case, stemmed=True) @@ -383,7 +393,8 @@ class TitleSearch(BaseExpression): for match in self.search_re.finditer(page.page_name): if page.request.cfg.xapian_stemming: # somewhere in regular word - if page.page_name[match.start()] not in config.chars_upper and \ + if not self.case and \ + page.page_name[match.start()] not in config.chars_upper and \ page.page_name[match.start()-1] in config.chars_lower: continue @@ -408,7 +419,10 @@ class TitleSearch(BaseExpression): return [] def xapian_wanted(self): - return not self.use_re + return True # only easy regexps possible + + def xapian_need_postproc(self): + return self.case def xapian_term(self, request, allterms): if self.use_re: @@ -444,7 +458,7 @@ class TitleSearch(BaseExpression): queries.append(Query(Query.OP_AND, t)) - if stemmed: + if not self.case and stemmed: self._build_re(' '.join(stemmed), use_re=False, case=self.case, stemmed=True) @@ -522,7 +536,10 @@ class LinkSearch(BaseExpression): return [] def xapian_wanted(self): - return not self.use_re + return True # only easy regexps possible + + def xapian_need_postproc(self): + return self.case def xapian_term(self, request, allterms): prefix = Xapian.Index.prefixMap['linkto'] @@ -560,7 +577,7 @@ class LanguageSearch(BaseExpression): self._pattern = pattern.lower() self.negated = 0 self.use_re = use_re - self.case = case + self.case = False # not case-sensitive! 
self.xapian_called = False self._build_re(self._pattern, use_re=use_re, case=case) @@ -582,7 +599,10 @@ class LanguageSearch(BaseExpression): return [Match()] def xapian_wanted(self): - return not self.use_re + return True # only easy regexps possible + + def xapian_need_postproc(self): + return False # case-sensitivity would make no sense def xapian_term(self, request, allterms): self.xapian_called = True diff -r 98b9469ce6ac -r fc2d00e2bb6b docs/CHANGES.fpletz --- a/docs/CHANGES.fpletz Fri Jul 21 01:55:19 2006 +0200 +++ b/docs/CHANGES.fpletz Sat Jul 22 17:24:41 2006 +0200 @@ -19,9 +19,15 @@ Branch moin/1.6-xapian-fpletz 1. regexp for whole word (all lowercase), or 2. just the root of the word * Subpages: Add positions for complete (!) pagenames into the index + * Check if permissions/acls are always obeyed + + ToDo (low priority): * Case-sensitive searches / Regexp on multiple terms: Graceful fallback to and/or merge with moinSearch based on nodes xapian can handle in the search term tree + * currently, xapian will fetch relevant pages and feed those into + _moinSearch for doing the real hard stuff it can't handle + -> need for a query optimizer, after SoC? New Features: * Faster search thanks to Xapian @@ -176,3 +182,10 @@ 2006-07-20 cooperation between moinSearch and Xapian for case-sensitive searches (code buried): We probably need a rather big rewrite! +2006-07-21 +2006-07-22 + * Final thoughts: No query optimizer for now. Case-sensitive + sensitive search is done by querying Xapian with the lowercased + terms and run _moinSearch over the relevant pages with the same + query. 
+ # HG changeset patch # User Franz Pletz # Date 1153599071 -7200 # Node ID 5ce3bea2e66c883bd489e4e9938fc80f10e8aef6 # Parent 277b97ba070056f97381e5e24e69f597061dd956 index categories diff -r 277b97ba0700 -r 5ce3bea2e66c MoinMoin/search/Xapian.py --- a/MoinMoin/search/Xapian.py Sat Jul 22 17:56:50 2006 +0200 +++ b/MoinMoin/search/Xapian.py Sat Jul 22 22:11:11 2006 +0200 @@ -170,7 +170,8 @@ class Index(BaseIndex): # the D term, and changing the last digit to a '2' if it's a '3') #X longer prefix for user-defined use 'linkto': 'XLINKTO', # this document links to that document - 'stem_lang': 'XSTEMLANG', # ISO Language code this document was stemmed in + 'stem_lang': 'XSTEMLANG', # ISO Language code this document was stemmed in + 'category': 'XCAT', # category this document belongs to #Y year (four digits) } @@ -316,6 +317,15 @@ class Index(BaseIndex): # return actual lang and lang to stem in return (lang, default_lang) + def _get_categories(self, page): + body = page.get_raw_body() + + sep = re.search(r'----*\r?\n', body) + if not sep: + return [] + + return re.findall('Category(.*)\r?\n', body[sep.end():]) + def _index_page(self, writer, page, mode='update'): """ Index a page - assumes that the write lock is acquired @arg writer: the index writer object @@ -331,6 +341,7 @@ class Index(BaseIndex): itemid = "%s:%s" % (wikiname, pagename) # XXX: Hack until we get proper metadata language, stem_language = self._get_languages(page) + categories = self._get_categories(page) updated = False if mode == 'update': @@ -362,6 +373,8 @@ class Index(BaseIndex): xapdoc.Keyword('stem_lang', stem_language)] for pagelink in page.getPageLinks(request): xkeywords.append(xapdoc.Keyword('linkto', pagelink)) + for category in categories: + xkeywords.append(xapdoc.Keyword('category', category)) xcontent = xapdoc.TextField('content', page.get_raw_body()) doc = xapdoc.Document(textFields=(xcontent, xtitle), keywords=xkeywords, diff -r 277b97ba0700 -r 5ce3bea2e66c 
MoinMoin/search/queryparser.py --- a/MoinMoin/search/queryparser.py Sat Jul 22 17:56:50 2006 +0200 +++ b/MoinMoin/search/queryparser.py Sat Jul 22 22:11:11 2006 +0200 @@ -19,6 +19,8 @@ try: from MoinMoin.search.Xapian import Query, UnicodeQuery except ImportError: pass + +CATEGORY_RE = re.compile('----\(-\*\)\(\\r\)\?\\n\)\(\.\*\)Category(.*)\\b', re.U) ############################################################################# ### query objects diff -r 277b97ba0700 -r 5ce3bea2e66c docs/CHANGES.fpletz --- a/docs/CHANGES.fpletz Sat Jul 22 17:56:50 2006 +0200 +++ b/docs/CHANGES.fpletz Sat Jul 22 22:11:11 2006 +0200 @@ -6,8 +6,6 @@ Branch moin/1.6-xapian-fpletz enable usage of _moinSearch for full compatibility? * HACK: MoinMoin.search.Xapian.Index._get_languages (wait for proper metadata) - * Positions saved in Xapian aren't always correct, check. Code - generally needs some more love. ToDo: * Implement the new search UI @@ -19,7 +17,6 @@ Branch moin/1.6-xapian-fpletz 1. regexp for whole word (all lowercase), or 2. just the root of the word * Subpages: Add positions for complete (!) pagenames into the index - * Check if permissions/acls are always obeyed ToDo (low priority): * Case-sensitive searches / Regexp on multiple terms: Graceful @@ -188,4 +185,5 @@ 2006-07-22 sensitive search is done by querying Xapian with the lowercased terms and run _moinSearch over the relevant pages with the same query. 
+ * Indexing of categories # HG changeset patch # User Franz Pletz # Date 1153695377 -7200 # Node ID b953b5ff4877dad58bed240f22f1947640603db2 # Parent 5ce3bea2e66c883bd489e4e9938fc80f10e8aef6 CategorySearch is live diff -r 5ce3bea2e66c -r b953b5ff4877 MoinMoin/search/Xapian.py --- a/MoinMoin/search/Xapian.py Sat Jul 22 22:11:11 2006 +0200 +++ b/MoinMoin/search/Xapian.py Mon Jul 24 00:56:17 2006 +0200 @@ -320,11 +320,17 @@ class Index(BaseIndex): def _get_categories(self, page): body = page.get_raw_body() - sep = re.search(r'----*\r?\n', body) - if not sep: + prev, next = (0, 1) + pos = 0 + while next: + if next != 1: + pos += next.end() + prev, next = next, re.search(r'----*\r?\n', body[pos:]) + + if not prev or prev == 1: return [] - - return re.findall('Category(.*)\r?\n', body[sep.end():]) + + return re.findall(r'Category([^\s]+)', body[pos:]) def _index_page(self, writer, page, mode='update'): """ Index a page - assumes that the write lock is acquired diff -r 5ce3bea2e66c -r b953b5ff4877 MoinMoin/search/queryparser.py --- a/MoinMoin/search/queryparser.py Sat Jul 22 22:11:11 2006 +0200 +++ b/MoinMoin/search/queryparser.py Mon Jul 24 00:56:17 2006 +0200 @@ -19,8 +19,6 @@ try: from MoinMoin.search.Xapian import Query, UnicodeQuery except ImportError: pass - -CATEGORY_RE = re.compile('----\(-\*\)\(\\r\)\?\\n\)\(\.\*\)Category(.*)\\b', re.U) ############################################################################# ### query objects @@ -274,9 +272,10 @@ class TextSearch(BaseExpression): matches = [] # Search in page name - results = self.titlesearch.search(page) - if results: - matches.extend(results) + if self.titlesearch: + results = self.titlesearch.search(page) + if results: + matches.extend(results) # Search in page body body = page.get_raw_body() @@ -628,6 +627,57 @@ class LanguageSearch(BaseExpression): else: pattern = self.pattern return UnicodeQuery('%s%s' % (prefix, pattern)) + +class CategorySearch(TextSearch): + """ Search the pages belonging to a 
category """ + + def __init__(self, *args, **kwargs): + TextSearch.__init__(self, *args, **kwargs) + self.titlesearch = None + + def _build_re(self, pattern, **kwargs): + kwargs['use_re'] = True + TextSearch._build_re(self, + r'(----(-*)(\r)?\n)(.*)Category%s\b' % pattern, **kwargs) + + def costs(self): + return 5000 # cheaper than a TextSearch + + def __unicode__(self): + neg = self.negated and '-' or '' + return u'%s!"%s"' % (neg, unicode(self._pattern)) + + def highlight_re(self): + return "" + + def xapian_wanted(self): + return True # only easy regexps possible + + def xapian_need_postproc(self): + return self.case + + def xapian_term(self, request, allterms): + self.xapian_called = True + prefix = Xapian.Index.prefixMap['category'] + if self.use_re: + # basic regex matching per term + terms = [] + found = None + n = len(prefix) + for term in allterms(): + if prefix == term[:n]: + found = True + if self.search_re.match(term[n+1:]): + terms.append(term) + elif found: + continue + + if not terms: + return Query() + return Query(Query.OP_OR, terms) + else: + pattern = self._pattern.lower() + return UnicodeQuery('%s:%s' % (prefix, pattern)) ############################################################################## @@ -715,6 +765,7 @@ class QueryParser: case = self.case linkto = False lang = False + category = False for m in modifiers: if "title".startswith(m): @@ -727,8 +778,20 @@ class QueryParser: linkto = True elif "language".startswith(m): lang = True - - if lang: + elif "category".startswith(m): + category = True + + # oh, let's better call xapian if we encouter this nasty regexp ;) + if not category: + cat_re = re.compile(r'----\(-\*\)\(\\r\)\?\\n\)\(\.\*\)Category(.*)\\b', re.U) + cat_match = cat_re.search(text) + if cat_match: + text = cat_match.groups()[0] + category = True + + if category: + obj = CategorySearch(text, use_re=False, case=case) + elif lang: obj = LanguageSearch(text, use_re=regex, case=False) elif linkto: obj = LinkSearch(text, 
use_re=regex, case=case) diff -r 5ce3bea2e66c -r b953b5ff4877 docs/CHANGES.fpletz --- a/docs/CHANGES.fpletz Sat Jul 22 22:11:11 2006 +0200 +++ b/docs/CHANGES.fpletz Mon Jul 24 00:56:17 2006 +0200 @@ -30,6 +30,8 @@ Branch moin/1.6-xapian-fpletz * Faster search thanks to Xapian * Searching for languages with new prefix lang/language, i.e. lang:de Note: Currently only available when Xapian is used + * CategorySearch with prefix category or with the regexp previously + used (autodetected as CategorySearch) * New config options: xapian_search 0 enables xapian-powered search xapian_index_dir None directory for xapian indices @@ -187,3 +189,6 @@ 2006-07-22 query. * Indexing of categories +2006-07-23 + * CategorySearch is live + # HG changeset patch # User Franz Pletz # Date 1153695712 -7200 # Node ID f29d1f51dbfa197807278679f7a7ecb494fa0f2b # Parent b953b5ff4877dad58bed240f22f1947640603db2 categories should be indexed in lowercase diff -r b953b5ff4877 -r f29d1f51dbfa MoinMoin/search/Xapian.py --- a/MoinMoin/search/Xapian.py Mon Jul 24 00:56:17 2006 +0200 +++ b/MoinMoin/search/Xapian.py Mon Jul 24 01:01:52 2006 +0200 @@ -330,7 +330,8 @@ class Index(BaseIndex): if not prev or prev == 1: return [] - return re.findall(r'Category([^\s]+)', body[pos:]) + return [cat.lower() + for cat in re.findall(r'Category([^\s]+)', body[pos:])] def _index_page(self, writer, page, mode='update'): """ Index a page - assumes that the write lock is acquired # HG changeset patch # User Franz Pletz # Date 1153696371 -7200 # Node ID 95aeb7dae0c6113999bd84a3b80cc91e75186302 # Parent f29d1f51dbfa197807278679f7a7ecb494fa0f2b update my CHANGES: some TODOs solved diff -r f29d1f51dbfa -r 95aeb7dae0c6 docs/CHANGES.fpletz --- a/docs/CHANGES.fpletz Mon Jul 24 01:01:52 2006 +0200 +++ b/docs/CHANGES.fpletz Mon Jul 24 01:12:51 2006 +0200 @@ -10,13 +10,11 @@ Branch moin/1.6-xapian-fpletz ToDo: * Implement the new search UI * Write/update documentation for all the new search stuff - * Indexing and searching of 
categories (new term prefix) * Reevaluate Xapwrap, possibly drop it and rip out usable stuff (i.e. ExceptionTranslator) * Add stemming support for highlighting stuff: 1. regexp for whole word (all lowercase), or 2. just the root of the word - * Subpages: Add positions for complete (!) pagenames into the index ToDo (low priority): * Case-sensitive searches / Regexp on multiple terms: Graceful @@ -191,4 +189,9 @@ 2006-07-22 2006-07-23 * CategorySearch is live + * Subpage issue does not need changes: Can be done with regex magic + I.e.: - subpages of MyPage: re:^MyPage/ + - subpages called SubPage: re:/SubPage + - subpages called Subpage (1st level): re:[^/]*/SubPage + - subpages called Subpage (last level): re:/Subpage$ # HG changeset patch # User Franz Pletz # Date 1153696758 -7200 # Node ID 10512e7ca243378d3be85865a62e6ee6b84d43e8 # Parent 95aeb7dae0c6113999bd84a3b80cc91e75186302 we should allow regex with category search prefix diff -r 95aeb7dae0c6 -r 10512e7ca243 MoinMoin/search/queryparser.py --- a/MoinMoin/search/queryparser.py Mon Jul 24 01:12:51 2006 +0200 +++ b/MoinMoin/search/queryparser.py Mon Jul 24 01:19:18 2006 +0200 @@ -788,9 +788,10 @@ class QueryParser: if cat_match: text = cat_match.groups()[0] category = True + regex = False if category: - obj = CategorySearch(text, use_re=False, case=case) + obj = CategorySearch(text, use_re=regex, case=case) elif lang: obj = LanguageSearch(text, use_re=regex, case=False) elif linkto: # HG changeset patch # User Franz Pletz # Date 1153743535 -7200 # Node ID 73f576c4bca35417fd9108c719c3985cf9442089 # Parent 9085983fc624dcdf312f09abfee25e4ae09a7f7d fix multiconfig merge and more informative SystemInfo macro diff -r 9085983fc624 -r 73f576c4bca3 MoinMoin/config/multiconfig.py --- a/MoinMoin/config/multiconfig.py Mon Jul 24 11:52:07 2006 +0200 +++ b/MoinMoin/config/multiconfig.py Mon Jul 24 14:18:55 2006 +0200 @@ -295,6 +295,7 @@ reStructuredText Quick Reference xapian_search = False # disabled until xapian is finished 
xapian_index_dir = None xapian_stemming = True + search_results_per_page = 10 mail_login = None # or "user pwd" if you need to use SMTP AUTH mail_sendmail = None # "/usr/sbin/sendmail -t -i" to not use SMTP, but sendmail diff -r 9085983fc624 -r 73f576c4bca3 MoinMoin/macro/SystemInfo.py --- a/MoinMoin/macro/SystemInfo.py Mon Jul 24 11:52:07 2006 +0200 +++ b/MoinMoin/macro/SystemInfo.py Mon Jul 24 14:18:55 2006 +0200 @@ -17,6 +17,7 @@ from MoinMoin import action, macro, pars from MoinMoin import action, macro, parser from MoinMoin.logfile import editlog, eventlog from MoinMoin.Page import Page +from MoinMoin.util import timefuncs def execute(Macro, args): """ show SystemInfo: wiki infos, wiki sw version, space usage infos """ @@ -111,11 +112,15 @@ def execute(Macro, args): row(_('Local extension parsers'), ', '.join(wikiutil.wikiPlugins('parser', Macro.cfg)) or nonestr) - state = (_('Disabled'), _('Enabled')) + from MoinMoin.search.builtin import Search + xapState = (_('Disabled'), _('Enabled')) idxState = (_('index available'), _('index unavailable')) - from MoinMoin.search.builtin import Search - row(_('Xapian search'), '%s, %s' % (state[request.cfg.xapian_search], - Search._xapianIndex(request) and idxState[0] or idxState[1])) + idx = Search._xapianIndex(request) + available = idx and idxState[0] or idxState[1] + mtime = _('last modified: %s') % (idx and + timefuncs.formathttpdate(idx.mtime()) or _('unavailable')) + row(_('Xapian search'), '%s, %s, %s' + % (xapState[request.cfg.xapian_search], available, mtime)) row(_('Active threads'), t_count or 'N/A') buf.write(u'') diff -r 9085983fc624 -r 73f576c4bca3 MoinMoin/search/Xapian.py --- a/MoinMoin/search/Xapian.py Mon Jul 24 11:52:07 2006 +0200 +++ b/MoinMoin/search/Xapian.py Mon Jul 24 14:18:55 2006 +0200 @@ -214,6 +214,7 @@ class Index(BaseIndex): def _do_queued_updates(self, request, amount=5): """ Assumes that the write lock is acquired """ + self.touch() writer = xapidx.Index(self.dir, True) 
writer.configure(self.prefixMap, self.indexValueMap) pages = self.queue.pages()[:amount] @@ -466,6 +467,7 @@ class Index(BaseIndex): mode = 'add' try: + self.touch() writer = xapidx.Index(self.dir, True) writer.configure(self.prefixMap, self.indexValueMap) pages = request.rootpage.getPageList(user='', exists=1) diff -r 9085983fc624 -r 73f576c4bca3 MoinMoin/search/builtin.py --- a/MoinMoin/search/builtin.py Mon Jul 24 11:52:07 2006 +0200 +++ b/MoinMoin/search/builtin.py Mon Jul 24 14:18:55 2006 +0200 @@ -167,6 +167,9 @@ class BaseIndex: def mtime(self): return os.path.getmtime(self.dir) + + def touch(self): + os.utime(self.dir) def _search(self, query): raise NotImplemented # HG changeset patch # User Franz Pletz # Date 1153750949 -7200 # Node ID a9ffe6479012083147935a4b84959607fc871cfa # Parent 73f576c4bca35417fd9108c719c3985cf9442089 nicer regexp support for TitleSearch diff -r 73f576c4bca3 -r a9ffe6479012 MoinMoin/search/Xapian.py --- a/MoinMoin/search/Xapian.py Mon Jul 24 14:18:55 2006 +0200 +++ b/MoinMoin/search/Xapian.py Mon Jul 24 16:22:29 2006 +0200 @@ -172,6 +172,7 @@ class Index(BaseIndex): 'linkto': 'XLINKTO', # this document links to that document 'stem_lang': 'XSTEMLANG', # ISO Language code this document was stemmed in 'category': 'XCAT', # category this document belongs to + 'full_title': 'XFT', # full title (for regex) #Y year (four digits) } @@ -378,7 +379,8 @@ class Index(BaseIndex): xtitle = xapdoc.TextField('title', pagename, True) # prefixed xkeywords = [xapdoc.Keyword('itemid', itemid), xapdoc.Keyword('lang', language), - xapdoc.Keyword('stem_lang', stem_language)] + xapdoc.Keyword('stem_lang', stem_language), + xapdoc.Keyword('full_title', pagename.lower())] for pagelink in page.getPageLinks(request): xkeywords.append(xapdoc.Keyword('linkto', pagelink)) for category in categories: diff -r 73f576c4bca3 -r a9ffe6479012 MoinMoin/search/builtin.py --- a/MoinMoin/search/builtin.py Mon Jul 24 14:18:55 2006 +0200 +++ b/MoinMoin/search/builtin.py Mon 
Jul 24 16:22:29 2006 +0200 @@ -169,7 +169,7 @@ class BaseIndex: return os.path.getmtime(self.dir) def touch(self): - os.utime(self.dir) + os.utime(self.dir, None) def _search(self, query): raise NotImplemented diff -r 73f576c4bca3 -r a9ffe6479012 MoinMoin/search/queryparser.py --- a/MoinMoin/search/queryparser.py Mon Jul 24 14:18:55 2006 +0200 +++ b/MoinMoin/search/queryparser.py Mon Jul 24 16:22:29 2006 +0200 @@ -428,8 +428,15 @@ class TitleSearch(BaseExpression): def xapian_term(self, request, allterms): if self.use_re: # basic regex matching per term - terms = [term for term in allterms() if - self.search_re.findall(term[1:]) and term[0] == 'S'] + terms = [] + found = False + for term in allterms(): + if term[:4] == 'XFT:': + found = True + if self.search_re.findall(term[4:]): + terms.append(term) + elif found: + break if not terms: return Query() queries = [Query(Query.OP_OR, terms)] diff -r 73f576c4bca3 -r a9ffe6479012 docs/CHANGES.fpletz --- a/docs/CHANGES.fpletz Mon Jul 24 14:18:55 2006 +0200 +++ b/docs/CHANGES.fpletz Mon Jul 24 16:22:29 2006 +0200 @@ -195,3 +195,7 @@ 2006-07-23 - subpages called Subpage (1st level): re:[^/]*/SubPage - subpages called Subpage (last level): re:/Subpage$ +2006-07-24 + * SystemInfo macro update (mtime) + * nicer regexp support for TitleSearch + # HG changeset patch # User Thomas Waldmann # Date 1153856874 -7200 # Node ID ccfe3e70f4f258a0056c87b48e70fa09cbfb4ea0 # Parent 277b97ba070056f97381e5e24e69f597061dd956 added missing daily CHANGES entries diff -r 277b97ba0700 -r ccfe3e70f4f2 docs/CHANGES.fpletz --- a/docs/CHANGES.fpletz Sat Jul 22 17:56:50 2006 +0200 +++ b/docs/CHANGES.fpletz Tue Jul 25 21:47:54 2006 +0200 @@ -189,3 +189,10 @@ 2006-07-22 terms and run _moinSearch over the relevant pages with the same query. 
+2006-07-23 + * student didn't work on the project -- ThomasWaldmann + +2006-07-24 + * the daily entry is missing here -- ThomasWaldmann + + # HG changeset patch # User Thomas Waldmann # Date 1154207995 -7200 # Node ID 017deaab4afdff4181c7b1109776b76d87b23e08 # Parent a1c47a57fd01c363802af4da899fdc0985d9c761 added missing daily entries diff -r a1c47a57fd01 -r 017deaab4afd docs/CHANGES.fpletz --- a/docs/CHANGES.fpletz Tue Jul 25 21:49:35 2006 +0200 +++ b/docs/CHANGES.fpletz Sat Jul 29 23:19:55 2006 +0200 @@ -199,3 +199,6 @@ 2006-07-24 * SystemInfo macro update (mtime) * nicer regexp support for TitleSearch +2006-07-25 .. 2006-07-29 + * student did not work on project + # HG changeset patch # User Franz Pletz # Date 1154531184 -7200 # Node ID d028d37e710550507be4ea316cd4378a4b9a6306 # Parent a31940162a32617a0f740e98800f0204f2ffab4f raise NotImplemented instance diff -r a31940162a32 -r d028d37e7105 MoinMoin/search/builtin.py --- a/MoinMoin/search/builtin.py Mon Jul 31 12:24:50 2006 +0200 +++ b/MoinMoin/search/builtin.py Wed Aug 02 17:06:24 2006 +0200 @@ -159,7 +159,7 @@ class BaseIndex: ## self.indexPagesInNewThread(request) def _main_dir(self): - raise NotImplemented + raise NotImplemented('...') def exists(self): """ Check if index exists """ @@ -172,7 +172,7 @@ class BaseIndex: os.utime(self.dir, None) def _search(self, query): - raise NotImplemented + raise NotImplemented('...') def search(self, query): #if not self.read_lock.acquire(1.0): @@ -243,7 +243,7 @@ class BaseIndex: When called in a new thread, lock is acquired before the call, and this method must release it when it finishes or fails. 
""" - raise NotImplemented + raise NotImplemented('...') def _do_queued_updates_InNewThread(self): """ do queued index updates in a new thread @@ -283,10 +283,10 @@ class BaseIndex: raise def _do_queued_updates(self, request, amount=5): - raise NotImplemented + raise NotImplemented('...') def optimize(self): - raise NotImplemented + raise NotImplemented('...') def contentfilter(self, filename): """ Get a filter for content of filename and return unicode content. """ @@ -311,7 +311,7 @@ class BaseIndex: return mt.mime_type(), data def test(self, request): - raise NotImplemented + raise NotImplemented('...') def _indexingRequest(self, request): """ Return a new request that can be used for index building. # HG changeset patch # User Franz Pletz # Date 1154533649 -7200 # Node ID 20908b1eccb2ea3b23ab8b51fcd02e82541eaaa6 # Parent 84f94820d612af73f30e1a9e35a714e2f494552c small whitespace fixes diff -r 84f94820d612 -r 20908b1eccb2 MoinMoin/action/fullsearch.py --- a/MoinMoin/action/fullsearch.py Wed Aug 02 17:07:09 2006 +0200 +++ b/MoinMoin/action/fullsearch.py Wed Aug 02 17:47:29 2006 +0200 @@ -98,8 +98,8 @@ def execute(pagename, request, fieldname # Then search results info = not titlesearch if context: - output = results.pageListWithContext(request, request.formatter, info=info, - context=context, hitsFrom=hitsFrom) + output = results.pageListWithContext(request, request.formatter, + info=info, context=context, hitsFrom=hitsFrom) else: output = results.pageList(request, request.formatter, info=info, hitsFrom=hitsFrom) diff -r 84f94820d612 -r 20908b1eccb2 MoinMoin/search/results.py --- a/MoinMoin/search/results.py Wed Aug 02 17:07:09 2006 +0200 +++ b/MoinMoin/search/results.py Wed Aug 02 17:47:29 2006 +0200 @@ -283,7 +283,8 @@ class SearchResults: {'hits': len(self.hits), 'pages': self.pages, 'hitsFrom': hitsFrom + 1, 'hitsTo': hitsFrom + request.cfg.search_results_per_page}), - u' (%s)' % formatter.text(_("%.2f seconds") % self.elapsed), + formatter.text(u' (%s)' % + 
formatter.text(_("%.2f seconds") % self.elapsed)), formatter.paragraph(0), ] return ''.join(output) diff -r 84f94820d612 -r 20908b1eccb2 wiki/htdocs/modern/css/common.css --- a/wiki/htdocs/modern/css/common.css Wed Aug 02 17:07:09 2006 +0200 +++ b/wiki/htdocs/modern/css/common.css Wed Aug 02 17:47:29 2006 +0200 @@ -334,12 +334,14 @@ div.codearea pre span.DiffSeparator {col .searchresults dt { margin-top: 1em; - font-weight: normal; + font-weight: normal; } .searchresults dd { - font-size: 0.85em; -} + font-size: 0.85em; +} + + /* MonthCalendar css */ # HG changeset patch # User Franz Pletz # Date 1154537615 -7200 # Node ID 53a1bd19196e35bad380dfcad4acd2a27d981247 # Parent 20908b1eccb2ea3b23ab8b51fcd02e82541eaaa6 css'ed searchstats diff -r 20908b1eccb2 -r 53a1bd19196e MoinMoin/search/results.py --- a/MoinMoin/search/results.py Wed Aug 02 17:47:29 2006 +0200 +++ b/MoinMoin/search/results.py Wed Aug 02 18:53:35 2006 +0200 @@ -277,14 +277,18 @@ class SearchResults: """ _ = request.getText output = [ - formatter.paragraph(1), - formatter.text(_("Hits %(hitsFrom)d to %(hitsTo)d " - "from %(hits)d results out of about %(pages)d pages.") % + formatter.paragraph(1, attr={'class': 'searchstats'}), + _("Results %(bs)s%(hitsFrom)d -%(hitsTo)d%(be)s " + "of about %(bs)s%(hits)d%(be)s results out of about " + "%(pages)d pages.") % {'hits': len(self.hits), 'pages': self.pages, - 'hitsFrom': hitsFrom + 1, - 'hitsTo': hitsFrom + request.cfg.search_results_per_page}), - formatter.text(u' (%s)' % - formatter.text(_("%.2f seconds") % self.elapsed)), + 'hitsFrom': hitsFrom + 1, + 'hitsTo': hitsFrom + request.cfg.search_results_per_page, + 'bs': formatter.strong(1), 'be': formatter.strong(0)}, + u' (%s %s)' % (''.join([formatter.strong(1), + formatter.text("%.2f" % self.elapsed), + formatter.strong(0)]), + formatter.text(_("seconds"))), formatter.paragraph(0), ] return ''.join(output) diff -r 20908b1eccb2 -r 53a1bd19196e wiki/htdocs/modern/css/common.css --- 
a/wiki/htdocs/modern/css/common.css Wed Aug 02 17:47:29 2006 +0200 +++ b/wiki/htdocs/modern/css/common.css Wed Aug 02 18:53:35 2006 +0200 @@ -341,7 +341,14 @@ div.codearea pre span.DiffSeparator {col font-size: 0.85em; } - +p.searchstats { + font-size: 0.8em; + text-align: right; + width: 100%; + background-color: #E6EAF0; + border-top: 1px solid #9088DC; + padding: 2px; +} /* MonthCalendar css */ # HG changeset patch # User Franz Pletz # Date 1154542505 -7200 # Node ID a8fb67cb953b12b4a53b8b8e1c446344c1f4768e # Parent 53a1bd19196e35bad380dfcad4acd2a27d981247 search info bar added, misc i18n fixes diff -r 53a1bd19196e -r a8fb67cb953b MoinMoin/macro/SystemInfo.py --- a/MoinMoin/macro/SystemInfo.py Wed Aug 02 18:53:35 2006 +0200 +++ b/MoinMoin/macro/SystemInfo.py Wed Aug 02 20:15:05 2006 +0200 @@ -17,7 +17,6 @@ from MoinMoin import action, macro, pars from MoinMoin import action, macro, parser from MoinMoin.logfile import editlog, eventlog from MoinMoin.Page import Page -from MoinMoin.util import timefuncs def execute(Macro, args): """ show SystemInfo: wiki infos, wiki sw version, space usage infos """ @@ -118,11 +117,13 @@ def execute(Macro, args): idx = Search._xapianIndex(request) available = idx and idxState[0] or idxState[1] mtime = _('last modified: %s') % (idx and - timefuncs.formathttpdate(idx.mtime()) or _('unavailable')) + request.user.getFormattedDateTime( + wikiutil.version2timestamp(idx.mtime())) or + _('N/A')) row(_('Xapian search'), '%s, %s, %s' % (xapState[request.cfg.xapian_search], available, mtime)) - row(_('Active threads'), t_count or 'N/A') + row(_('Active threads'), t_count or _('N/A')) buf.write(u'') return Macro.formatter.rawHTML(buf.getvalue()) diff -r 53a1bd19196e -r a8fb67cb953b MoinMoin/search/results.py --- a/MoinMoin/search/results.py Wed Aug 02 18:53:35 2006 +0200 +++ b/MoinMoin/search/results.py Wed Aug 02 20:15:05 2006 +0200 @@ -378,6 +378,7 @@ class SearchResults: self._reset(request, formatter) f = formatter write = 
self.buffer.write + _ = request.getText # Add pages formatted as definition list if self.hits: @@ -418,6 +419,17 @@ class SearchResults: f.definition_desc(1), fmt_context, f.definition_desc(0), + f.definition_desc(1, attr={'class': 'searchresinfobar'}), + f.text('%.1fk - ' % (page.page.size()/1024.0)), + f.text('rev: %d %s- ' % (page.page.get_real_rev(), + not page.page.rev and '(%s) ' % _('current') or '')), + f.text('last modified: %(time)s - ' % page.page.lastEditInfo()), + # XXX: proper metadata + #f.text('lang: %s - ' % page.page.language), + f.url(1, href='#'), + f.text(_('Similar pages')), + f.url(0), + f.definition_desc(0), ] write(''.join(item)) write(f.definition_list(0)) @@ -652,7 +664,7 @@ class SearchResults: if n < 0: n = 0 l.append(''.join([ f.url(1, href=from_uri(n)), - _('Previous Page'), + f.text(_('Previous Page')), f.url(0) ])) if hitsFrom + hitsPerPage < hitsNum: # next page available @@ -660,7 +672,7 @@ class SearchResults: if n >= hitsNum: n = hitsNum - 1 l.append(''.join([ f.url(1, href=from_uri(n)), - _('Next Page'), + f.text(_('Next Page')), f.url(0) ])) return f.text(' | ').join(l) diff -r 53a1bd19196e -r a8fb67cb953b wiki/htdocs/modern/css/common.css --- a/wiki/htdocs/modern/css/common.css Wed Aug 02 18:53:35 2006 +0200 +++ b/wiki/htdocs/modern/css/common.css Wed Aug 02 20:15:05 2006 +0200 @@ -341,6 +341,11 @@ div.codearea pre span.DiffSeparator {col font-size: 0.85em; } +.searchresults dd.searchresinfobar { + color: #008000; + margin-left: 15px; +} + p.searchstats { font-size: 0.8em; text-align: right; # HG changeset patch # User Franz Pletz # Date 1154542852 -7200 # Node ID 4c3b141bda6b95bb7da810fe145c0c9ea0188a42 # Parent a8fb67cb953b12b4a53b8b8e1c446344c1f4768e update my CHANGES diff -r a8fb67cb953b -r 4c3b141bda6b docs/CHANGES.fpletz --- a/docs/CHANGES.fpletz Wed Aug 02 20:15:05 2006 +0200 +++ b/docs/CHANGES.fpletz Wed Aug 02 20:20:52 2006 +0200 @@ -199,6 +199,12 @@ 2006-07-24 * SystemInfo macro update (mtime) * nicer regexp support 
for TitleSearch -2006-07-25 .. 2006-07-29 +2006-07-25 .. 2006-07-30 * student did not work on project +2006-08-01 .. 2006-07-02 + * Reformatted search statistics to use CSS and be more google-like + (only in modern theme for now) + * Added "search result info bar", showing revision, size, mtime, + links for further searches (-> ToDo) etc. + diff -r a8fb67cb953b -r 4c3b141bda6b wiki/htdocs/modern/css/common.css --- a/wiki/htdocs/modern/css/common.css Wed Aug 02 20:15:05 2006 +0200 +++ b/wiki/htdocs/modern/css/common.css Wed Aug 02 20:20:52 2006 +0200 @@ -351,7 +351,7 @@ p.searchstats { text-align: right; width: 100%; background-color: #E6EAF0; - border-top: 1px solid #9088DC; + border-top: 1px solid #9088DC; padding: 2px; } # HG changeset patch # User Franz Pletz # Date 1154544986 -7200 # Node ID 237ca54182a7dc5004c317a0fd354b630b034446 # Parent 4c3b141bda6b95bb7da810fe145c0c9ea0188a42 copied code for search box over to macro/FullSearch.py diff -r 4c3b141bda6b -r 237ca54182a7 MoinMoin/macro/FullSearch.py --- a/MoinMoin/macro/FullSearch.py Wed Aug 02 20:20:52 2006 +0200 +++ b/MoinMoin/macro/FullSearch.py Wed Aug 02 20:56:26 2006 +0200 @@ -32,13 +32,63 @@ from MoinMoin import config, wikiutil, s Dependencies = ["pages"] + +def search_box(type, macro): + """ Make a search box + + Make both Title Search and Full Search boxes, according to type. + + @param type: search box type: 'titlesearch' or 'fullsearch' + @rtype: unicode + @return: search box html fragment + """ + _ = macro._ + if macro.form.has_key('value'): + default = wikiutil.escape(macro.form["value"][0], quote=1) + else: + default = '' + + # Title search settings + boxes = '' + button = _("Search Titles") + + # Special code for fullsearch + if type == "fullsearch": + boxes = [ + u'
', + u'', + _('Display context of search results'), + u'
', + u'', + _('Case-sensitive searching'), + ] + boxes = u'\n'.join(boxes) + button = _("Search Text") + + # Format + type = (type == "titlesearch") + html = [ + u'
', + u'
', + u'', + u'' % type, + u'' % default, + u'' % button, + boxes, + u'
', + u'
', + ] + html = u'\n'.join(html) + return macro.formatter.rawHTML(html) + + def execute(macro, needle): request = macro.request _ = request.getText # if no args given, invoke "classic" behavior if needle is None: - return macro._m_search("fullsearch") + return search_box("fullsearch", macro) # With empty arguments, simulate title click (backlinks to page) elif needle == '': diff -r 4c3b141bda6b -r 237ca54182a7 MoinMoin/macro/__init__.py --- a/MoinMoin/macro/__init__.py Wed Aug 02 20:20:52 2006 +0200 +++ b/MoinMoin/macro/__init__.py Wed Aug 02 20:56:26 2006 +0200 @@ -145,55 +145,8 @@ class Macro: return self.defaultDependency def _macro_TitleSearch(self, args): - return self._m_search("titlesearch") - - def _m_search(self, type): - """ Make a search box - - Make both Title Search and Full Search boxes, according to type. - - @param type: search box type: 'titlesearch' or 'fullsearch' - @rtype: unicode - @return: search box html fragment - """ - _ = self._ - if self.form.has_key('value'): - default = wikiutil.escape(self.form["value"][0], quote=1) - else: - default = '' - - # Title search settings - boxes = '' - button = _("Search Titles") - - # Special code for fullsearch - if type == "fullsearch": - boxes = [ - u'
', - u'', - _('Display context of search results'), - u'
', - u'', - _('Case-sensitive searching'), - ] - boxes = u'\n'.join(boxes) - button = _("Search Text") - - # Format - type = (type == "titlesearch") - html = [ - u'
', - u'
', - u'', - u'' % type, - u'' % default, - u'' % button, - boxes, - u'
', - u'
', - ] - html = u'\n'.join(html) - return self.formatter.rawHTML(html) + from FullSearch import search_box + return search_box("titlesearch", self) def _macro_GoTo(self, args): """ Make a goto box # HG changeset patch # User Franz Pletz # Date 1154860960 -7200 # Node ID d56eeab4e0702a348528923a0138e4be3b162f65 # Parent b3c2d87024c3db3554646a84d4f954403351927f google-like paging diff -r b3c2d87024c3 -r d56eeab4e070 MoinMoin/search/results.py --- a/MoinMoin/search/results.py Sat Aug 05 20:24:25 2006 +0200 +++ b/MoinMoin/search/results.py Sun Aug 06 12:42:40 2006 +0200 @@ -278,7 +278,7 @@ class SearchResults: _ = request.getText output = [ formatter.paragraph(1, attr={'class': 'searchstats'}), - _("Results %(bs)s%(hitsFrom)d -%(hitsTo)d%(be)s " + _("Results %(bs)s%(hitsFrom)d - %(hitsTo)d%(be)s " "of about %(bs)s%(hits)d%(be)s results out of about " "%(pages)d pages.") % {'hits': len(self.hits), 'pages': self.pages, @@ -644,6 +644,12 @@ class SearchResults: return ''.join(output) return '' + def _img_url(self, img): + cfg = self.request.cfg + # XXX: proper gfx + #return '%s/%s/img/%s' % (cfg.url_prefix, cfg.theme_default, img) + return 'http://www.google.com/intl/en/%s' % img + def formatPrevNextPageLinks(self, hitsFrom, hitsPerPage, hitsNum): """ Format previous and next page links in page @@ -655,27 +661,86 @@ class SearchResults: """ _ = self.request.getText f = self.formatter + + # url magic from_re = r'\&from=[\d]+' uri = re.sub(from_re, '', self.request.request_uri) - from_uri = lambda n: '%s&from=%i' % (uri, n) + page_url = lambda n: '%s&from=%i' % (uri, n * hitsPerPage) + + pages = float(hitsNum) / hitsPerPage + if pages - int(pages) > 0.0: + pages = int(pages) + 2 + cur_page = hitsFrom / hitsPerPage l = [] - if hitsFrom > 0: # previous page available - n = hitsFrom - hitsPerPage - if n < 0: n = 0 + + # previous page available + if cur_page > 0: l.append(''.join([ - f.url(1, href=from_uri(n)), - f.text(_('Previous Page')), + f.url(1, 
href=page_url(cur_page-1)), + f.text(_('Previous')), f.url(0) ])) - if hitsFrom + hitsPerPage < hitsNum: # next page available - n = hitsFrom + hitsPerPage - if n >= hitsNum: n = hitsNum - 1 + else: + l.append('') + + # list of pages to be shown + page_range = range(*( + cur_page - 4 < 0 and + (0, pages >= 10 and 10 or pages-1) + or + (cur_page - 4, cur_page + 5 > pages and + cur_page + (pages - 1 - cur_page) or + cur_page + 6))) + l.extend([''.join([ + f.url(1, href=page_url(i)), + f.text(str(i+1)), + f.url(0), + ]) for i in page_range]) + + # next page available + if cur_page < pages: l.append(''.join([ - f.url(1, href=from_uri(n)), - f.text(_('Next Page')), + f.url(1, href=page_url(cur_page+1)), + f.text(_('Next')), f.url(0) ])) - return f.text(' | ').join(l) + else: + l.append('') + + return ''.join([ + f.table(1, attrs={'tableclass': 'searchpages'}), + f.table_row(1), + f.table_cell(1), + # first image, previous page + l[0] and + f.image(self._img_url('nav_previous.gif')) or + f.image(self._img_url('nav_first.gif')), + f.table_cell(0), + # images for ooos, highlighted current page + ''.join([ + ''.join([ + f.table_cell(1), + f.url(1, href=page_url(i)), + f.image(self._img_url(i == cur_page and + 'nav_current.gif' or 'nav_page.gif')), + f.url(0), + f.table_cell(0), + ]) for i in page_range + ]), + f.table_cell(1), + # last image, next page + l[-1] and f.image(self._img_url('nav_next.gif')) or + f.image(self._img_url('nav_last.gif')), + f.table_cell(0), + f.table_row(0), + f.table_row(1), + f.table_cell(1), + # textlinks + (f.table_cell(0) + f.table_cell(1)).join(l), + f.table_cell(0), + f.table_row(0), + f.table(0), + ]) def querystring(self, querydict=None): """ Return query string, used in the page link """ diff -r b3c2d87024c3 -r d56eeab4e070 docs/CHANGES.fpletz --- a/docs/CHANGES.fpletz Sat Aug 05 20:24:25 2006 +0200 +++ b/docs/CHANGES.fpletz Sun Aug 06 12:42:40 2006 +0200 @@ -202,9 +202,13 @@ 2006-07-25 .. 2006-07-30 2006-07-25 .. 
2006-07-30 * student did not work on project -2006-08-01 .. 2006-07-02 +2006-08-01 .. 2006-08-02 * Reformatted search statistics to use CSS and be more google-like (only in modern theme for now) * Added "search result info bar", showing revision, size, mtime, links for further searches (-> ToDo) etc. +2006-08-05 .. 2006-08-06 + * (finally :)) Google-like paging, using images from google.com until + we get proper moin gfx + diff -r b3c2d87024c3 -r d56eeab4e070 wiki/htdocs/modern/css/common.css --- a/wiki/htdocs/modern/css/common.css Sat Aug 05 20:24:25 2006 +0200 +++ b/wiki/htdocs/modern/css/common.css Sun Aug 06 12:42:40 2006 +0200 @@ -355,6 +355,19 @@ p.searchstats { padding: 2px; } +.searchpages { + margin-left: auto; + margin-right: auto; +} + +.searchpages tr, .searchpages td { + border: 0; + padding: 0; + margin: 0; + text-align: center; + vertical-align: middle; +} + /* MonthCalendar css */ /* days without and with pages linked to them */ # HG changeset patch # User Franz Pletz # Date 1154862602 -7200 # Node ID c85f3cf602c982f50bac43b4f6b445a434d8d4e7 # Parent d56eeab4e0702a348528923a0138e4be3b162f65 small fixes for pagelinks diff -r d56eeab4e070 -r c85f3cf602c9 MoinMoin/search/results.py --- a/MoinMoin/search/results.py Sun Aug 06 12:42:40 2006 +0200 +++ b/MoinMoin/search/results.py Sun Aug 06 13:10:02 2006 +0200 @@ -647,7 +647,7 @@ class SearchResults: def _img_url(self, img): cfg = self.request.cfg # XXX: proper gfx - #return '%s/%s/img/%s' % (cfg.url_prefix, cfg.theme_default, img) + #return '%s/%s/img/%s' % (cfg.url_prefix, cfg.theme.name, img) return 'http://www.google.com/intl/en/%s' % img def formatPrevNextPageLinks(self, hitsFrom, hitsPerPage, hitsNum): @@ -692,9 +692,9 @@ class SearchResults: cur_page + (pages - 1 - cur_page) or cur_page + 6))) l.extend([''.join([ - f.url(1, href=page_url(i)), + i != cur_page and f.url(1, href=page_url(i)) or '', f.text(str(i+1)), - f.url(0), + i != cur_page and f.url(0) or '', ]) for i in page_range]) # next page 
available @@ -720,10 +720,10 @@ class SearchResults: ''.join([ ''.join([ f.table_cell(1), - f.url(1, href=page_url(i)), + i != cur_page and f.url(1, href=page_url(i)) or '', f.image(self._img_url(i == cur_page and 'nav_current.gif' or 'nav_page.gif')), - f.url(0), + i != cur_page and f.url(0) or '', f.table_cell(0), ]) for i in page_range ]), diff -r d56eeab4e070 -r c85f3cf602c9 wiki/htdocs/modern/css/common.css --- a/wiki/htdocs/modern/css/common.css Sun Aug 06 12:42:40 2006 +0200 +++ b/wiki/htdocs/modern/css/common.css Sun Aug 06 13:10:02 2006 +0200 @@ -366,6 +366,14 @@ p.searchstats { margin: 0; text-align: center; vertical-align: middle; + color: #a90a08; + font-weight: bold; +} + +.searchpages td a, .searchpages td a:link { + color: #000000; + text-decoration: underline; + font-weight: normal; } /* MonthCalendar css */ # HG changeset patch # User Franz Pletz # Date 1154865294 -7200 # Node ID 2f98fbe3a52727e5a444dec79f99b59dd43fe800 # Parent c85f3cf602c982f50bac43b4f6b445a434d8d4e7 fixes for google paging diff -r c85f3cf602c9 -r 2f98fbe3a527 MoinMoin/search/results.py --- a/MoinMoin/search/results.py Sun Aug 06 13:10:02 2006 +0200 +++ b/MoinMoin/search/results.py Sun Aug 06 13:54:54 2006 +0200 @@ -688,7 +688,7 @@ class SearchResults: cur_page - 4 < 0 and (0, pages >= 10 and 10 or pages-1) or - (cur_page - 4, cur_page + 5 > pages and + (cur_page - 4, cur_page + 6 > pages and cur_page + (pages - 1 - cur_page) or cur_page + 6))) l.extend([''.join([ @@ -713,7 +713,11 @@ class SearchResults: f.table_cell(1), # first image, previous page l[0] and - f.image(self._img_url('nav_previous.gif')) or + ''.join([ + f.url(1, href=page_url(cur_page-1)), + f.image(self._img_url('nav_previous.gif')), + f.url(0), + ]) or f.image(self._img_url('nav_first.gif')), f.table_cell(0), # images for ooos, highlighted current page @@ -729,7 +733,12 @@ class SearchResults: ]), f.table_cell(1), # last image, next page - l[-1] and f.image(self._img_url('nav_next.gif')) or + l[-1] and + 
''.join([ + f.url(1, href=page_url(cur_page+1)), + f.image(self._img_url('nav_next.gif')), + f.url(0), + ]) or f.image(self._img_url('nav_last.gif')), f.table_cell(0), f.table_row(0), # HG changeset patch # User Franz Pletz # Date 1154865500 -7200 # Node ID b2bed51a045da3949f32f92b4b61da0e513834e0 # Parent 2f98fbe3a52727e5a444dec79f99b59dd43fe800 final fix ;) diff -r 2f98fbe3a527 -r b2bed51a045d MoinMoin/search/results.py --- a/MoinMoin/search/results.py Sun Aug 06 13:54:54 2006 +0200 +++ b/MoinMoin/search/results.py Sun Aug 06 13:58:20 2006 +0200 @@ -686,9 +686,9 @@ class SearchResults: # list of pages to be shown page_range = range(*( cur_page - 4 < 0 and - (0, pages >= 10 and 10 or pages-1) + (0, pages >= 10 and 10 or pages) or - (cur_page - 4, cur_page + 6 > pages and + (cur_page - 4, cur_page + 5 > pages and cur_page + (pages - 1 - cur_page) or cur_page + 6))) l.extend([''.join([ # HG changeset patch # User Franz Pletz # Date 1154865618 -7200 # Node ID d0e72768f936bd4f4d8ca8eb27f7cc5516577162 # Parent b2bed51a045da3949f32f92b4b61da0e513834e0 debugging over xapian.wikiwikiweb.de isn't fun.. last one ;) diff -r b2bed51a045d -r d0e72768f936 MoinMoin/search/results.py --- a/MoinMoin/search/results.py Sun Aug 06 13:58:20 2006 +0200 +++ b/MoinMoin/search/results.py Sun Aug 06 14:00:18 2006 +0200 @@ -688,7 +688,7 @@ class SearchResults: cur_page - 4 < 0 and (0, pages >= 10 and 10 or pages) or - (cur_page - 4, cur_page + 5 > pages and + (cur_page - 4, cur_page + 6 > pages and cur_page + (pages - 1 - cur_page) or cur_page + 6))) l.extend([''.join([ # HG changeset patch # User Franz Pletz # Date 1154866346 -7200 # Node ID 64c02d0697d5d4aeb0c4faf50cf81e1f3b93e99c # Parent d0e72768f936bd4f4d8ca8eb27f7cc5516577162 there were some serious flaws in continued paging.. 
works now ;) diff -r d0e72768f936 -r 64c02d0697d5 MoinMoin/search/results.py --- a/MoinMoin/search/results.py Sun Aug 06 14:00:18 2006 +0200 +++ b/MoinMoin/search/results.py Sun Aug 06 14:12:26 2006 +0200 @@ -669,7 +669,7 @@ class SearchResults: pages = float(hitsNum) / hitsPerPage if pages - int(pages) > 0.0: - pages = int(pages) + 2 + pages = int(pages) + 1 cur_page = hitsFrom / hitsPerPage l = [] @@ -686,11 +686,9 @@ class SearchResults: # list of pages to be shown page_range = range(*( cur_page - 4 < 0 and - (0, pages >= 10 and 10 or pages) - or - (cur_page - 4, cur_page + 6 > pages and - cur_page + (pages - 1 - cur_page) or - cur_page + 6))) + (0, pages >= 10 and 10 or pages) or + (cur_page - 4, cur_page + 6 >= pages and + pages or cur_page + 6))) l.extend([''.join([ i != cur_page and f.url(1, href=page_url(i)) or '', f.text(str(i+1)), # HG changeset patch # User Franz Pletz # Date 1154867636 -7200 # Node ID d7d17b4285f5643b604063a2c64e533d79ba04b7 # Parent 64c02d0697d5d4aeb0c4faf50cf81e1f3b93e99c ensure timings are correct for _xapianSearch diff -r 64c02d0697d5 -r d7d17b4285f5 MoinMoin/search/builtin.py --- a/MoinMoin/search/builtin.py Sun Aug 06 14:12:26 2006 +0200 +++ b/MoinMoin/search/builtin.py Sun Aug 06 14:33:56 2006 +0200 @@ -421,10 +421,12 @@ class Search: pass #except AttributeError: # pages = [] - self.request.clock.stop('_xapianSearch') - - if not self.query.xapian_need_postproc(): - return self._getHits(hits, self._xapianMatch) + + try: + if not self.query.xapian_need_postproc(): + return self._getHits(hits, self._xapianMatch) + finally: + self.request.clock.stop('_xapianSearch') return self._moinSearch(pages) # HG changeset patch # User Franz Pletz # Date 1154876321 -7200 # Node ID 9b101f696445bbaee05b8541015aabe2eef90f3d # Parent d7d17b4285f5643b604063a2c64e533d79ba04b7 index domains of a page (standard, underlay) diff -r d7d17b4285f5 -r 9b101f696445 MoinMoin/search/Xapian.py --- a/MoinMoin/search/Xapian.py Sun Aug 06 14:33:56 2006 +0200 +++ 
b/MoinMoin/search/Xapian.py Sun Aug 06 16:58:41 2006 +0200 @@ -173,6 +173,7 @@ class Index(BaseIndex): 'stem_lang': 'XSTEMLANG', # ISO Language code this document was stemmed in 'category': 'XCAT', # category this document belongs to 'full_title': 'XFT', # full title (for regex) + 'domain': 'XDOMAIN', # standard or underlay #Y year (four digits) } @@ -335,6 +336,12 @@ class Index(BaseIndex): return [cat.lower() for cat in re.findall(r'Category([^\s]+)', body[pos:])] + def _get_domains(self, page): + if page.isUnderlayPage(): + yield 'underlay' + if page.isStandardPage(): + yield 'standard' + def _index_page(self, writer, page, mode='update'): """ Index a page - assumes that the write lock is acquired @arg writer: the index writer object @@ -351,6 +358,7 @@ class Index(BaseIndex): # XXX: Hack until we get proper metadata language, stem_language = self._get_languages(page) categories = self._get_categories(page) + domains = tuple(self._get_domains(page)) updated = False if mode == 'update': @@ -385,6 +393,8 @@ class Index(BaseIndex): xkeywords.append(xapdoc.Keyword('linkto', pagelink)) for category in categories: xkeywords.append(xapdoc.Keyword('category', category)) + for domain in domains: + xkeywords.append(xapdoc.Keyword('domain', domain)) xcontent = xapdoc.TextField('content', page.get_raw_body()) doc = xapdoc.Document(textFields=(xcontent, xtitle), keywords=xkeywords, diff -r d7d17b4285f5 -r 9b101f696445 docs/CHANGES.fpletz --- a/docs/CHANGES.fpletz Sun Aug 06 14:33:56 2006 +0200 +++ b/docs/CHANGES.fpletz Sun Aug 06 16:58:41 2006 +0200 @@ -211,4 +211,5 @@ 2006-08-05 .. 2006-08-06 2006-08-05 .. 
2006-08-06 * (finally :)) Google-like paging, using images from google.com until we get proper moin gfx - + * index domains of a page (standard, underlay) + # HG changeset patch # User Franz Pletz # Date 1154886070 -7200 # Node ID b648187eac6c44327c91ade241812f510e5be3d8 # Parent 35e02195990607e68daf8de28b0a524052a69d7f added preliminary artwork for paging diff -r 35e021959906 -r b648187eac6c MoinMoin/search/results.py --- a/MoinMoin/search/results.py Sun Aug 06 18:55:31 2006 +0200 +++ b/MoinMoin/search/results.py Sun Aug 06 19:41:10 2006 +0200 @@ -646,9 +646,7 @@ class SearchResults: def _img_url(self, img): cfg = self.request.cfg - # XXX: proper gfx - #return '%s/%s/img/%s' % (cfg.url_prefix, cfg.theme.name, img) - return 'http://www.google.com/intl/en/%s' % img + return '%s/%s/img/%s.png' % (cfg.url_prefix, self.request.theme.name, img) def formatPrevNextPageLinks(self, hitsFrom, hitsPerPage, hitsNum): """ Format previous and next page links in page @@ -696,7 +694,7 @@ class SearchResults: ]) for i in page_range]) # next page available - if cur_page < pages: + if cur_page < pages-1: l.append(''.join([ f.url(1, href=page_url(cur_page+1)), f.text(_('Next')), @@ -713,10 +711,10 @@ class SearchResults: l[0] and ''.join([ f.url(1, href=page_url(cur_page-1)), - f.image(self._img_url('nav_previous.gif')), + f.image(self._img_url('nav_prev')), f.url(0), ]) or - f.image(self._img_url('nav_first.gif')), + f.image(self._img_url('nav_first')), f.table_cell(0), # images for ooos, highlighted current page ''.join([ @@ -724,7 +722,7 @@ class SearchResults: f.table_cell(1), i != cur_page and f.url(1, href=page_url(i)) or '', f.image(self._img_url(i == cur_page and - 'nav_current.gif' or 'nav_page.gif')), + 'nav_current' or 'nav_page')), i != cur_page and f.url(0) or '', f.table_cell(0), ]) for i in page_range @@ -734,10 +732,10 @@ class SearchResults: l[-1] and ''.join([ f.url(1, href=page_url(cur_page+1)), - f.image(self._img_url('nav_next.gif')), + 
f.image(self._img_url('nav_next')), f.url(0), ]) or - f.image(self._img_url('nav_last.gif')), + f.image(self._img_url('nav_last')), f.table_cell(0), f.table_row(0), f.table_row(1), diff -r 35e021959906 -r b648187eac6c wiki/htdocs/modern/img/nav_current.png Binary file wiki/htdocs/modern/img/nav_current.png has changed diff -r 35e021959906 -r b648187eac6c wiki/htdocs/modern/img/nav_first.png Binary file wiki/htdocs/modern/img/nav_first.png has changed diff -r 35e021959906 -r b648187eac6c wiki/htdocs/modern/img/nav_last.png Binary file wiki/htdocs/modern/img/nav_last.png has changed diff -r 35e021959906 -r b648187eac6c wiki/htdocs/modern/img/nav_next.png Binary file wiki/htdocs/modern/img/nav_next.png has changed diff -r 35e021959906 -r b648187eac6c wiki/htdocs/modern/img/nav_page.png Binary file wiki/htdocs/modern/img/nav_page.png has changed diff -r 35e021959906 -r b648187eac6c wiki/htdocs/modern/img/nav_prev.png Binary file wiki/htdocs/modern/img/nav_prev.png has changed # HG changeset patch # User Franz Pletz # Date 1154886905 -7200 # Node ID 51579f15abf9b8c00006e22bb955de7bab9799cc # Parent b648187eac6c44327c91ade241812f510e5be3d8 small fixes for the gfx diff -r b648187eac6c -r 51579f15abf9 wiki/htdocs/modern/img/nav_current.png Binary file wiki/htdocs/modern/img/nav_current.png has changed diff -r b648187eac6c -r 51579f15abf9 wiki/htdocs/modern/img/nav_first.png Binary file wiki/htdocs/modern/img/nav_first.png has changed diff -r b648187eac6c -r 51579f15abf9 wiki/htdocs/modern/img/nav_prev.png Binary file wiki/htdocs/modern/img/nav_prev.png has changed # HG changeset patch # User Thomas Waldmann # Date 1154941697 -7200 # Node ID 0e6266605d55b352db5a773bba941ee764689f21 # Parent 017deaab4afdff4181c7b1109776b76d87b23e08 updated CHANGES.fpletz diff -r 017deaab4afd -r 0e6266605d55 docs/CHANGES.fpletz --- a/docs/CHANGES.fpletz Sat Jul 29 23:19:55 2006 +0200 +++ b/docs/CHANGES.fpletz Mon Aug 07 11:08:17 2006 +0200 @@ -202,3 +202,13 @@ 2006-07-25 .. 
2006-07-29 2006-07-25 .. 2006-07-29 * student did not work on project +2006-07-30 no work on project +2006-07-31 no work on project +2006-08-01 no work on project +2006-08-02 entry missing +2006-08-03 no work on project +2006-08-04 no work on project +2006-08-05 entry missing +2006-08-06 entry missing + + # HG changeset patch # User Franz Pletz # Date 1155045005 -7200 # Node ID 79ac7ab77ea86d34756b394a0c65f964f23aba20 # Parent 10099880cf8ff880674c6f478511e4e170545864 hit info bar for titlesearches, bugfix for getSearchResults diff -r 10099880cf8f -r 79ac7ab77ea8 MoinMoin/search/results.py --- a/MoinMoin/search/results.py Tue Aug 08 08:49:47 2006 +0200 +++ b/MoinMoin/search/results.py Tue Aug 08 15:50:05 2006 +0200 @@ -345,6 +345,7 @@ class SearchResults: self.formatTitle(page), f.pagelink(0, page.page_name), matchInfo, + self.formatHitInfoBar(page), f.listitem(0), ] write(''.join(item)) @@ -419,17 +420,7 @@ class SearchResults: f.definition_desc(1), fmt_context, f.definition_desc(0), - f.definition_desc(1, attr={'class': 'searchresinfobar'}), - f.text('%.1fk - ' % (page.page.size()/1024.0)), - f.text('rev: %d %s- ' % (page.page.get_real_rev(), - not page.page.rev and '(%s) ' % _('current') or '')), - f.text('last modified: %(time)s - ' % page.page.lastEditInfo()), - # XXX: proper metadata - #f.text('lang: %s - ' % page.page.language), - f.url(1, href='#'), - f.text(_('Similar pages')), - f.url(0), - f.definition_desc(0), + self.formatHitInfoBar(page), ] write(''.join(item)) write(f.definition_list(0)) @@ -747,6 +738,24 @@ class SearchResults: f.table(0), ]) + def formatHitInfoBar(self, page): + f = self.formatter + _ = self.request.getText + return ''.join([ + f.paragraph(1, attr={'class': 'searchhitinfobar'}), + f.text('%.1fk - ' % (page.page.size()/1024.0)), + f.text('rev: %d %s- ' % (page.page.get_real_rev(), + not page.page.rev and '(%s) ' % _('current') or '')), + f.text('last modified: %(time)s' % page.page.lastEditInfo()), + # XXX: proper metadata + 
#f.text('lang: %s - ' % page.page.language), + #f.url(1, href='#'), + #f.text(_('Similar pages')), + #f.url(0), + f.paragraph(0), + ]) + + def querystring(self, querydict=None): """ Return query string, used in the page link """ if querydict is None: @@ -797,11 +806,13 @@ def getSearchResults(request, query, hit for wikiname, page, attachment, match in hits: if wikiname in (request.cfg.interwikiname, 'Self'): # a local match if attachment: - result_hits.append(FoundAttachment(page.page_name, attachment)) + result_hits.append(FoundAttachment(page.page_name, + attachment, page=page)) else: - result_hits.append(FoundPage(page.page_name, match)) + result_hits.append(FoundPage(page.page_name, match, page)) else: - result_hits.append(FoundRemote(wikiname, page, attachment, match)) + result_hits.append(FoundRemote(wikiname, page.page_name, + attachment, match, page)) elapsed = time.time() - start count = request.rootpage.getPageCount() return SearchResults(query, result_hits, count, elapsed) diff -r 10099880cf8f -r 79ac7ab77ea8 docs/CHANGES.fpletz --- a/docs/CHANGES.fpletz Tue Aug 08 08:49:47 2006 +0200 +++ b/docs/CHANGES.fpletz Tue Aug 08 15:50:05 2006 +0200 @@ -216,3 +216,8 @@ 2006-08-05 .. 
2006-08-06 we get proper moin gfx * index domains of a page (standard, underlay) +2006-08-07 + * info bar for titlesearches + * bugfix for results code: sometimes we never got a page instance + in Found{Page,Attachment,...} which yielded strange errors + diff -r 10099880cf8f -r 79ac7ab77ea8 wiki/htdocs/modern/css/common.css --- a/wiki/htdocs/modern/css/common.css Tue Aug 08 08:49:47 2006 +0200 +++ b/wiki/htdocs/modern/css/common.css Tue Aug 08 15:50:05 2006 +0200 @@ -337,13 +337,14 @@ div.codearea pre span.DiffSeparator {col font-weight: normal; } -.searchresults dd { +.searchresults dd, .searchresults p { font-size: 0.85em; } -.searchresults dd.searchresinfobar { +.searchresults .searchhitinfobar { color: #008000; margin-left: 15px; + margin-top: 0; } p.searchstats { # HG changeset patch # User Franz Pletz # Date 1155047424 -7200 # Node ID e38e27967a97a5b1fb9f088e3a597c705977b220 # Parent 79ac7ab77ea86d34756b394a0c65f964f23aba20 added more timers to xapian code diff -r 79ac7ab77ea8 -r e38e27967a97 MoinMoin/search/builtin.py --- a/MoinMoin/search/builtin.py Tue Aug 08 15:50:05 2006 +0200 +++ b/MoinMoin/search/builtin.py Tue Aug 08 16:30:24 2006 +0200 @@ -394,17 +394,20 @@ class Search: Get a list of pages using fast xapian search and return moin search in those pages. 
""" + clock = self.request.clock pages = None index = self._xapianIndex(self.request) if index and self.query.xapian_wanted(): - self.request.clock.start('_xapianSearch') + clock.start('_xapianSearch') try: from MoinMoin.support import xapwrap + clock.start('_xapianQuery') query = self.query.xapian_term(self.request, index.allterms) self.request.log("xapianSearch: query = %r" % query.get_description()) query = xapwrap.index.QObjQuery(query) enq, hits = index.search(query) + clock.stop('_xapianQuery') self.request.log("xapianSearch: finds: %r" % hits) def dict_decode(d): """ decode dict values to unicode """ @@ -424,9 +427,13 @@ class Search: try: if not self.query.xapian_need_postproc(): - return self._getHits(hits, self._xapianMatch) + clock.start('_xapianProcess') + try: + return self._getHits(hits, self._xapianMatch) + finally: + clock.stop('_xapianProcess') finally: - self.request.clock.stop('_xapianSearch') + clock.stop('_xapianSearch') return self._moinSearch(pages) diff -r 79ac7ab77ea8 -r e38e27967a97 docs/CHANGES.fpletz --- a/docs/CHANGES.fpletz Tue Aug 08 15:50:05 2006 +0200 +++ b/docs/CHANGES.fpletz Tue Aug 08 16:30:24 2006 +0200 @@ -221,3 +221,6 @@ 2006-08-07 * bugfix for results code: sometimes we never got a page instance in Found{Page,Attachment,...} which yielded strange errors +2006-08-08 + * added some more timers for regression testing + # HG changeset patch # User Franz Pletz # Date 1155067825 -7200 # Node ID c6ae37934d31b60f48775441a58c700aa853f600 # Parent e38e27967a97a5b1fb9f088e3a597c705977b220 saner url manipulation diff -r e38e27967a97 -r c6ae37934d31 MoinMoin/search/results.py --- a/MoinMoin/search/results.py Tue Aug 08 16:30:24 2006 +0200 +++ b/MoinMoin/search/results.py Tue Aug 08 22:10:25 2006 +0200 @@ -651,10 +651,11 @@ class SearchResults: _ = self.request.getText f = self.formatter - # url magic - from_re = r'\&from=[\d]+' - uri = re.sub(from_re, '', self.request.request_uri) - page_url = lambda n: '%s&from=%i' % (uri, n * 
hitsPerPage) + querydict = wikiutil.parseQueryString(self.request.query_string) + uri_prefix = self.request.splitURI(self.request.request_uri)[0] + def page_url(n): + querydict.update({'from': n * hitsPerPage}) + return uri_prefix + '?' + wikiutil.makeQueryString(querydict) pages = float(hitsNum) / hitsPerPage if pages - int(pages) > 0.0: # HG changeset patch # User Franz Pletz # Date 1155069555 -7200 # Node ID d2d160c344b7828cdd60232ba324a35e7f574a67 # Parent c6ae37934d31b60f48775441a58c700aa853f600 improved highlighting code to work better with stemming and special searches, extended SystemInfo macro diff -r c6ae37934d31 -r d2d160c344b7 MoinMoin/macro/SystemInfo.py --- a/MoinMoin/macro/SystemInfo.py Tue Aug 08 22:10:25 2006 +0200 +++ b/MoinMoin/macro/SystemInfo.py Tue Aug 08 22:39:15 2006 +0200 @@ -123,6 +123,7 @@ def execute(Macro, args): _('N/A')) row(_('Xapian search'), '%s, %s, %s' % (xapState[request.cfg.xapian_search], available, mtime)) + row(_('Xapian stemming'), xapState[request.cfg.xapian_stemming]) row(_('Active threads'), t_count or _('N/A')) buf.write(u'') diff -r c6ae37934d31 -r d2d160c344b7 MoinMoin/search/queryparser.py --- a/MoinMoin/search/queryparser.py Tue Aug 08 22:10:25 2006 +0200 +++ b/MoinMoin/search/queryparser.py Tue Aug 08 22:39:15 2006 +0200 @@ -336,7 +336,7 @@ class TextSearch(BaseExpression): tmp = [] for w, s, pos in analyzer.tokenize(t, flat_stemming=False): tmp.append(UnicodeQuery(Query.OP_OR, (w, s))) - stemmed.append(w) + stemmed.append(s) t = tmp else: # just not stemmed @@ -344,8 +344,10 @@ class TextSearch(BaseExpression): queries.append(Query(Query.OP_AND, t)) if not self.case and stemmed: - self._build_re(' '.join(stemmed), use_re=False, - case=self.case, stemmed=True) + new_pat = ' '.join(stemmed) + self._pattern = new_pat + self._build_re(new_pat, use_re=False, case=self.case, + stemmed=True) # titlesearch OR parsed wikiwords return Query(Query.OP_OR, @@ -457,7 +459,7 @@ class TitleSearch(BaseExpression): 
tmp.append(UnicodeQuery(Query.OP_OR, ['%s%s' % (Xapian.Index.prefixMap['title'], j) for j in (w, s)])) - stemmed.append(w) + stemmed.append(s) t = tmp else: # just not stemmed @@ -467,8 +469,10 @@ class TitleSearch(BaseExpression): queries.append(Query(Query.OP_AND, t)) if not self.case and stemmed: - self._build_re(' '.join(stemmed), use_re=False, - case=self.case, stemmed=True) + new_pat = ' '.join(stemmed) + self._pattern = new_pat + self._build_re(new_pat, use_re=False, case=self.case, + stemmed=True) return Query(Query.OP_AND, queries) @@ -635,6 +639,7 @@ class LanguageSearch(BaseExpression): pattern = self.pattern return UnicodeQuery('%s%s' % (prefix, pattern)) + class CategorySearch(TextSearch): """ Search the pages belonging to a category """ @@ -655,7 +660,7 @@ class CategorySearch(TextSearch): return u'%s!"%s"' % (neg, unicode(self._pattern)) def highlight_re(self): - return "" + return u'(Category%s)' % self._pattern def xapian_wanted(self): return True # only easy regexps possible diff -r c6ae37934d31 -r d2d160c344b7 docs/CHANGES.fpletz --- a/docs/CHANGES.fpletz Tue Aug 08 22:10:25 2006 +0200 +++ b/docs/CHANGES.fpletz Tue Aug 08 22:39:15 2006 +0200 @@ -12,9 +12,6 @@ Branch moin/1.6-xapian-fpletz * Write/update documentation for all the new search stuff * Reevaluate Xapwrap, possibly drop it and rip out usable stuff (i.e. ExceptionTranslator) - * Add stemming support for highlighting stuff: - 1. regexp for whole word (all lowercase), or - 2. 
just the root of the word ToDo (low priority): * Case-sensitive searches / Regexp on multiple terms: Graceful @@ -223,4 +220,6 @@ 2006-08-07 2006-08-08 * added some more timers for regression testing - + * improved highlighting code to work better with stemming and + special searches, extended SystemInfo macro + # HG changeset patch # User Franz Pletz # Date 1155167261 -7200 # Node ID 0a947454dec744b50967e4ed17cf399209c4722a # Parent d2d160c344b7828cdd60232ba324a35e7f574a67 use xapian for sorting search results diff -r d2d160c344b7 -r 0a947454dec7 MoinMoin/action/fullsearch.py --- a/MoinMoin/action/fullsearch.py Tue Aug 08 22:39:15 2006 +0200 +++ b/MoinMoin/action/fullsearch.py Thu Aug 10 01:47:41 2006 +0200 @@ -55,11 +55,19 @@ def execute(pagename, request, fieldname Page(request, pagename).send_page(request, msg=err) return + # Setup for type of search + if titlesearch: + title = _('Title Search: "%s"') + sort = 'page_name' + else: + title = _('Full Text Search: "%s"') + sort = 'weight' + # search the pages from MoinMoin.search import searchPages, QueryParser query = QueryParser(case=case, regex=regex, titlesearch=titlesearch).parse_query(needle) - results = searchPages(request, query) + results = searchPages(request, query, sort) # directly show a single hit # XXX won't work with attachment search @@ -78,14 +86,6 @@ def execute(pagename, request, fieldname # This action generate data using the user language request.setContentLanguage(request.lang) - - # Setup for type of search - if titlesearch: - title = _('Title Search: "%s"') - results.sortByPagename() - else: - title = _('Full Text Search: "%s"') - results.sortByWeight() request.theme.send_title(title % needle, form=request.form, pagename=pagename) diff -r d2d160c344b7 -r 0a947454dec7 MoinMoin/search/Xapian.py --- a/MoinMoin/search/Xapian.py Tue Aug 08 22:39:15 2006 +0200 +++ b/MoinMoin/search/Xapian.py Thu Aug 10 01:47:41 2006 +0200 @@ -195,7 +195,7 @@ class Index(BaseIndex): """ Check if the Xapian index 
exists """ return BaseIndex.exists(self) and os.listdir(self.dir) - def _search(self, query): + def _search(self, query, sort=None): """ read lock must be acquired """ while True: try: @@ -210,7 +210,16 @@ class Index(BaseIndex): timestamp = self.mtime() break - hits = searcher.search(query, valuesWanted=['pagename', 'attachment', 'mtime', 'wikiname']) + kw = {} + if sort == 'weight': + # XXX: we need real weight here, like _moinSearch + # (TradWeight in xapian) + kw['sortByRelevence'] = True + if sort == 'page_name': + kw['sortKey'] = 'pagename' + + hits = searcher.search(query, valuesWanted=['pagename', + 'attachment', 'mtime', 'wikiname'], **kw) self.request.cfg.xapian_searchers.append((searcher, timestamp)) return hits diff -r d2d160c344b7 -r 0a947454dec7 MoinMoin/search/__init__.py --- a/MoinMoin/search/__init__.py Tue Aug 08 22:39:15 2006 +0200 +++ b/MoinMoin/search/__init__.py Thu Aug 10 01:47:41 2006 +0200 @@ -13,7 +13,7 @@ from MoinMoin.search.queryparser import from MoinMoin.search.queryparser import QueryParser from MoinMoin.search.builtin import Search -def searchPages(request, query, **kw): +def searchPages(request, query, sort='weight', **kw): """ Search the text of all pages for query. 
@param request: current request @@ -23,5 +23,5 @@ def searchPages(request, query, **kw): """ if isinstance(query, str) or isinstance(query, unicode): query = QueryParser(**kw).parse_query(query) - return Search(request, query).run() + return Search(request, query, sort).run() diff -r d2d160c344b7 -r 0a947454dec7 MoinMoin/search/builtin.py --- a/MoinMoin/search/builtin.py Tue Aug 08 22:39:15 2006 +0200 +++ b/MoinMoin/search/builtin.py Thu Aug 10 01:47:41 2006 +0200 @@ -174,11 +174,11 @@ class BaseIndex: def _search(self, query): raise NotImplemented('...') - def search(self, query): + def search(self, query, *args, **kw): #if not self.read_lock.acquire(1.0): # raise self.LockedException #try: - hits = self._search(query) + hits = self._search(query, *args, **kw) #finally: # self.read_lock.release() return hits @@ -352,9 +352,10 @@ class Search: class Search: """ A search run """ - def __init__(self, request, query): + def __init__(self, request, query, sort='weight'): self.request = request self.query = query + self.sort = sort self.filtered = False self.fs_rootpage = "FS" # XXX FS hardcoded @@ -370,7 +371,12 @@ class Search: if not self.filtered: hits = self._filter(hits) - return getSearchResults(self.request, self.query, hits, start) + # when xapian was used, we won't need to sort manually + if self.request.cfg.xapian_search: + self.sort = None + + return getSearchResults(self.request, self.query, hits, start, + self.sort) # ---------------------------------------------------------------- @@ -406,9 +412,9 @@ class Search: self.request.log("xapianSearch: query = %r" % query.get_description()) query = xapwrap.index.QObjQuery(query) - enq, hits = index.search(query) + enq, hits = index.search(query, sort=self.sort) clock.stop('_xapianQuery') - self.request.log("xapianSearch: finds: %r" % hits) + #self.request.log("xapianSearch: finds: %r" % hits) def dict_decode(d): """ decode dict values to unicode """ for k, v in d.items(): @@ -434,6 +440,9 @@ class Search: 
clock.stop('_xapianProcess') finally: clock.stop('_xapianSearch') + else: + # we didn't use xapian in this request + self.request.cfg.xapian_search = 0 return self._moinSearch(pages) diff -r d2d160c344b7 -r 0a947454dec7 MoinMoin/search/queryparser.py --- a/MoinMoin/search/queryparser.py Tue Aug 08 22:39:15 2006 +0200 +++ b/MoinMoin/search/queryparser.py Thu Aug 10 01:47:41 2006 +0200 @@ -436,7 +436,7 @@ class TitleSearch(BaseExpression): if term[:4] == 'XFT:': found = True if self.search_re.findall(term[4:]): - terms.append(term) + terms.append(Query(term, 100)) elif found: break if not terms: @@ -456,15 +456,19 @@ class TitleSearch(BaseExpression): # stemmed OR not stemmed tmp = [] for w, s, pos in analyzer.tokenize(t, flat_stemming=False): - tmp.append(UnicodeQuery(Query.OP_OR, - ['%s%s' % (Xapian.Index.prefixMap['title'], j) + tmp.append(Query(Query.OP_OR, + [UnicodeQuery('%s%s' % + (Xapian.Index.prefixMap['title'], j), + 100) for j in (w, s)])) stemmed.append(s) t = tmp else: # just not stemmed - t = [UnicodeQuery('%s%s' % (Xapian.Index.prefixMap['title'], w)) - for w, pos in analyzer.tokenize(t)] + t = [UnicodeQuery( + '%s%s' % (Xapian.Index.prefixMap['title'], w), + 100) + for w, pos in analyzer.tokenize(t)] queries.append(Query(Query.OP_AND, t)) diff -r d2d160c344b7 -r 0a947454dec7 MoinMoin/search/results.py --- a/MoinMoin/search/results.py Tue Aug 08 22:39:15 2006 +0200 +++ b/MoinMoin/search/results.py Thu Aug 10 01:47:41 2006 +0200 @@ -244,27 +244,30 @@ class SearchResults: """ # Public functions -------------------------------------------------- - def __init__(self, query, hits, pages, elapsed): + def __init__(self, query, hits, pages, elapsed, sort=None): self.query = query # the query self.hits = hits # hits list - self.sort = None # hits are unsorted initially self.pages = pages # number of pages in the wiki self.elapsed = elapsed # search time - def sortByWeight(self): + if sort == 'weight': + self._sortByWeight() + elif sort == 'page_name': + 
self.sortByPagename() + self.sort = sort + + def _sortByWeight(self): """ Sorts found pages by the weight of the matches """ tmp = [(hit.weight(), hit.page_name, hit) for hit in self.hits] tmp.sort() tmp.reverse() self.hits = [item[2] for item in tmp] - self.sort = 'weight' - - def sortByPagename(self): + + def _sortByPagename(self): """ Sorts a list of found pages alphabetical by page name """ tmp = [(hit.page_name, hit) for hit in self.hits] tmp.sort() self.hits = [item[1] for item in tmp] - self.sort = 'page_name' def stats(self, request, formatter, hitsFrom): """ Return search statistics, formatted with formatter @@ -802,7 +805,7 @@ class SearchResults: self.matchLabel = (_('match'), _('matches')) -def getSearchResults(request, query, hits, start): +def getSearchResults(request, query, hits, start, sort=None): result_hits = [] for wikiname, page, attachment, match in hits: if wikiname in (request.cfg.interwikiname, 'Self'): # a local match @@ -816,5 +819,5 @@ def getSearchResults(request, query, hit attachment, match, page)) elapsed = time.time() - start count = request.rootpage.getPageCount() - return SearchResults(query, result_hits, count, elapsed) - + return SearchResults(query, result_hits, count, elapsed, sort) + diff -r d2d160c344b7 -r 0a947454dec7 docs/CHANGES.fpletz --- a/docs/CHANGES.fpletz Tue Aug 08 22:39:15 2006 +0200 +++ b/docs/CHANGES.fpletz Thu Aug 10 01:47:41 2006 +0200 @@ -223,3 +223,7 @@ 2006-08-08 * improved highlighting code to work better with stemming and special searches, extended SystemInfo macro +2006-08-09 + * use xapian for sorting, first step not to fetch all results + -> still TODO: need real weight + # HG changeset patch # User Franz Pletz # Date 1155222932 -7200 # Node ID 20f59999e79c1b8a0f0e5ac2b159c69dd35477f3 # Parent 820518f0118ea376a497c2c2ac1ccc7e76a2df19 only show page links when we got enough results diff -r 820518f0118e -r 20f59999e79c MoinMoin/search/results.py --- a/MoinMoin/search/results.py Thu Aug 10 13:28:52 2006 
+0200 +++ b/MoinMoin/search/results.py Thu Aug 10 17:15:32 2006 +0200 @@ -316,6 +316,9 @@ class SearchResults: list = lambda on: f.number_list(on, start=hitsFrom+1) else: list = f.bullet_list + + if paging and len(self.hits) <= request.cfg.search_results_per_page: + paging = False # Add pages formatted as list if self.hits: @@ -383,6 +386,9 @@ class SearchResults: f = formatter write = self.buffer.write _ = request.getText + + if paging and len(self.hits) <= request.cfg.search_results_per_page: + paging = False # Add pages formatted as definition list if self.hits: # HG changeset patch # User Franz Pletz # Date 1155223376 -7200 # Node ID d2eadfef54b8d1082711650bb9f3120700c52f7e # Parent 20f59999e79c1b8a0f0e5ac2b159c69dd35477f3 right/left-aligning for prev/next images diff -r 20f59999e79c -r d2eadfef54b8 MoinMoin/search/results.py --- a/MoinMoin/search/results.py Thu Aug 10 17:15:32 2006 +0200 +++ b/MoinMoin/search/results.py Thu Aug 10 17:22:56 2006 +0200 @@ -707,7 +707,7 @@ class SearchResults: return ''.join([ f.table(1, attrs={'tableclass': 'searchpages'}), f.table_row(1), - f.table_cell(1), + f.table_cell(1, attrs={'class': 'prev'}), # first image, previous page l[0] and ''.join([ @@ -728,7 +728,7 @@ class SearchResults: f.table_cell(0), ]) for i in page_range ]), - f.table_cell(1), + f.table_cell(1, attrs={'class': 'next'}), # last image, next page l[-1] and ''.join([ diff -r 20f59999e79c -r d2eadfef54b8 wiki/htdocs/modern/css/common.css --- a/wiki/htdocs/modern/css/common.css Thu Aug 10 17:15:32 2006 +0200 +++ b/wiki/htdocs/modern/css/common.css Thu Aug 10 17:22:56 2006 +0200 @@ -371,6 +371,14 @@ p.searchstats { font-weight: bold; } +.searchpages td.prev { + text-align: right; +} + +.searchpage td.next { + text-align: left; +} + .searchpages td a, .searchpages td a:link { color: #000000; text-decoration: underline; # HG changeset patch # User Franz Pletz # Date 1155224630 -7200 # Node ID cba856bc0c059e3b259da9e3a8325aeaff38e606 # Parent 
d2eadfef54b8d1082711650bb9f3120700c52f7e estimate numer of hits correctly diff -r d2eadfef54b8 -r cba856bc0c05 MoinMoin/search/Xapian.py --- a/MoinMoin/search/Xapian.py Thu Aug 10 17:22:56 2006 +0200 +++ b/MoinMoin/search/Xapian.py Thu Aug 10 17:43:50 2006 +0200 @@ -262,7 +262,7 @@ class Index(BaseIndex): mtime = wikiutil.timestamp2version(mtime) if mode == 'update': query = xapidx.RawQuery(xapdoc.makePairForWrite('itemid', itemid)) - enq, docs = writer.search(query, valuesWanted=['pagename', 'attachment', 'mtime', 'wikiname', ]) + enq, mset, docs = writer.search(query, valuesWanted=['pagename', 'attachment', 'mtime', 'wikiname', ]) if docs: doc = docs[0] # there should be only one uid = doc['uid'] @@ -375,7 +375,7 @@ class Index(BaseIndex): # you can just call database.replace_document(uid_term, doc) # -> done in xapwrap.index.Index.index() query = xapidx.RawQuery(xapdoc.makePairForWrite('itemid', itemid)) - enq, docs = writer.search(query, valuesWanted=['pagename', 'attachment', 'mtime', 'wikiname', ]) + enq, mset, docs = writer.search(query, valuesWanted=['pagename', 'attachment', 'mtime', 'wikiname', ]) if docs: doc = docs[0] # there should be only one uid = doc['uid'] @@ -429,7 +429,7 @@ class Index(BaseIndex): mtime = wikiutil.timestamp2version(os.path.getmtime(filename)) if mode == 'update': query = xapidx.RawQuery(xapdoc.makePairForWrite('itemid', att_itemid)) - enq, docs = writer.search(query, valuesWanted=['pagename', 'attachment', 'mtime', ]) + enq, mset, docs = writer.search(query, valuesWanted=['pagename', 'attachment', 'mtime', ]) if debug: request.log("##%r %r" % (filename, docs)) if docs: doc = docs[0] # there should be only one diff -r d2eadfef54b8 -r cba856bc0c05 MoinMoin/search/builtin.py --- a/MoinMoin/search/builtin.py Thu Aug 10 17:22:56 2006 +0200 +++ b/MoinMoin/search/builtin.py Thu Aug 10 17:43:50 2006 +0200 @@ -374,9 +374,17 @@ class Search: # when xapian was used, we won't need to sort manually if self.request.cfg.xapian_search: self.sort 
= None + mset = self._xapianMset + estimated_hits = ( + (mset.get_matches_estimated() == mset.get_matches_upper_bound() and + mset.get_matches_estimated() == mset.get_matches_lower_bound()) and + '' or 'about', + mset.get_matches_estimated()) + else: + estimated_hits = None return getSearchResults(self.request, self.query, hits, start, - self.sort) + self.sort, estimated_hits) # ---------------------------------------------------------------- @@ -412,7 +420,7 @@ class Search: self.request.log("xapianSearch: query = %r" % query.get_description()) query = xapwrap.index.QObjQuery(query) - enq, hits = index.search(query, sort=self.sort) + enq, mset, hits = index.search(query, sort=self.sort) clock.stop('_xapianQuery') #self.request.log("xapianSearch: finds: %r" % hits) def dict_decode(d): @@ -425,6 +433,7 @@ class Search: pages = [dict_decode(hit['values']) for hit in hits] self.request.log("xapianSearch: finds pages: %r" % pages) self._xapianEnquire = enq + self._xapianMset = mset self._xapianIndex = index except BaseIndex.LockedException: pass diff -r d2eadfef54b8 -r cba856bc0c05 MoinMoin/search/results.py --- a/MoinMoin/search/results.py Thu Aug 10 17:22:56 2006 +0200 +++ b/MoinMoin/search/results.py Thu Aug 10 17:43:50 2006 +0200 @@ -244,11 +244,12 @@ class SearchResults: """ # Public functions -------------------------------------------------- - def __init__(self, query, hits, pages, elapsed, sort=None): + def __init__(self, query, hits, pages, elapsed, sort, estimated_hits): self.query = query # the query self.hits = hits # hits list self.pages = pages # number of pages in the wiki self.elapsed = elapsed # search time + self.estimated_hits = estimated_hits # about how much hits? 
if sort == 'weight': self._sortByWeight() @@ -279,12 +280,17 @@ class SearchResults: @return formatted statistics """ _ = request.getText + + if not self.estimated_hits: + self.estimated_hits = ('', len(self.hits)) + output = [ formatter.paragraph(1, attr={'class': 'searchstats'}), _("Results %(bs)s%(hitsFrom)d - %(hitsTo)d%(be)s " - "of about %(bs)s%(hits)d%(be)s results out of about " - "%(pages)d pages.") % - {'hits': len(self.hits), 'pages': self.pages, + "of %(aboutHits)s %(bs)s%(hits)d%(be)s results out of" + "about %(pages)d pages.") % + {'aboutHits': self.estimated_hits[0], + 'hits': self.estimated_hits[1], 'pages': self.pages, 'hitsFrom': hitsFrom + 1, 'hitsTo': hitsFrom + request.cfg.search_results_per_page, 'bs': formatter.strong(1), 'be': formatter.strong(0)}, @@ -811,7 +817,7 @@ class SearchResults: self.matchLabel = (_('match'), _('matches')) -def getSearchResults(request, query, hits, start, sort=None): +def getSearchResults(request, query, hits, start, sort, estimated_hits): result_hits = [] for wikiname, page, attachment, match in hits: if wikiname in (request.cfg.interwikiname, 'Self'): # a local match @@ -825,5 +831,6 @@ def getSearchResults(request, query, hit attachment, match, page)) elapsed = time.time() - start count = request.rootpage.getPageCount() - return SearchResults(query, result_hits, count, elapsed, sort) - + return SearchResults(query, result_hits, count, elapsed, sort, + estimated_hits) + diff -r d2eadfef54b8 -r cba856bc0c05 MoinMoin/support/xapwrap/index.py --- a/MoinMoin/support/xapwrap/index.py Thu Aug 10 17:22:56 2006 +0200 +++ b/MoinMoin/support/xapwrap/index.py Thu Aug 10 17:43:50 2006 +0200 @@ -635,7 +635,7 @@ class ReadOnlyIndex: valRes[valName] = xapDoc.get_value(valueIndex) thisResult['values'] = valRes results.append(thisResult) - return enq, results + return enq, mset, results except: del enq, mset raise # HG changeset patch # User Franz Pletz # Date 1155226241 -7200 # Node ID d58efa0c4ce82d3d271f478fb14b4645fd0f8fdd # 
Parent cba856bc0c059e3b259da9e3a8325aeaff38e606 show real range count of results diff -r cba856bc0c05 -r d58efa0c4ce8 MoinMoin/search/results.py --- a/MoinMoin/search/results.py Thu Aug 10 17:43:50 2006 +0200 +++ b/MoinMoin/search/results.py Thu Aug 10 18:10:41 2006 +0200 @@ -292,7 +292,9 @@ class SearchResults: {'aboutHits': self.estimated_hits[0], 'hits': self.estimated_hits[1], 'pages': self.pages, 'hitsFrom': hitsFrom + 1, - 'hitsTo': hitsFrom + request.cfg.search_results_per_page, + 'hitsTo': hitsFrom + + min(self.estimated_hits[1] - hitsFrom, + request.cfg.search_results_per_page), 'bs': formatter.strong(1), 'be': formatter.strong(0)}, u' (%s %s)' % (''.join([formatter.strong(1), formatter.text("%.2f" % self.elapsed), # HG changeset patch # User Thomas Waldmann # Date 1155539061 -7200 # Node ID 96e852930e24fbffca54d0154e948a520ea90cf1 # Parent d58efa0c4ce82d3d271f478fb14b4645fd0f8fdd added missing CHANGES entries diff -r d58efa0c4ce8 -r 96e852930e24 docs/CHANGES.fpletz --- a/docs/CHANGES.fpletz Thu Aug 10 18:10:41 2006 +0200 +++ b/docs/CHANGES.fpletz Mon Aug 14 09:04:21 2006 +0200 @@ -227,3 +227,8 @@ 2006-08-09 * use xapian for sorting, first step not to fetch all results -> still TODO: need real weight +2006-08-10 + * entry missing + +2006-08-10 .. 
13 no work on project + # HG changeset patch # User Franz Pletz # Date 1155590934 -7200 # Node ID 835c392be95d0133cdf0d594fb98ebabd953ed0d # Parent 135aa02138b7761d2268e21ac9977bee2c97d146 improvements for SystemInfo macro diff -r 135aa02138b7 -r 835c392be95d MoinMoin/macro/SystemInfo.py --- a/MoinMoin/macro/SystemInfo.py Mon Aug 14 17:26:28 2006 +0200 +++ b/MoinMoin/macro/SystemInfo.py Mon Aug 14 23:28:54 2006 +0200 @@ -115,14 +115,18 @@ def execute(Macro, args): from MoinMoin.search.builtin import Search xapState = (_('Disabled'), _('Enabled')) idxState = (_('index available'), _('index unavailable')) - idx = Search._xapianIndex(request) - available = idx and idxState[0] or idxState[1] - mtime = _('last modified: %s') % (idx and - request.user.getFormattedDateTime( - wikiutil.version2timestamp(idx.mtime())) or - _('N/A')) - row(_('Xapian search'), '%s, %s, %s' - % (xapState[request.cfg.xapian_search], available, mtime)) + out = xapState[request.cfg.xapian_search] + + if request.cfg.xapian_search: + idx = Search._xapianIndex(request) + available = idx and idxState[0] or idxState[1] + mtime = _('last modified: %s') % (idx and + request.user.getFormattedDateTime( + wikiutil.version2timestamp(idx.mtime())) or + _('N/A')) + out += ', %s, %s' % (available, mtime) + + row(_('Xapian search'), out) row(_('Xapian stemming'), xapState[request.cfg.xapian_stemming]) row(_('Active threads'), t_count or _('N/A')) # HG changeset patch # User Franz Pletz # Date 1155592368 -7200 # Node ID 6d5d345599a1b82c39c056e2e2ee1caef263ffcf # Parent 835c392be95d0133cdf0d594fb98ebabd953ed0d special message when no results found diff -r 835c392be95d -r 6d5d345599a1 MoinMoin/action/fullsearch.py --- a/MoinMoin/action/fullsearch.py Mon Aug 14 23:28:54 2006 +0200 +++ b/MoinMoin/action/fullsearch.py Mon Aug 14 23:52:48 2006 +0200 @@ -92,18 +92,27 @@ def execute(pagename, request, fieldname # Start content (important for RTL support) request.write(request.formatter.startContent("content")) - # First 
search stats - request.write(results.stats(request, request.formatter, hitsFrom)) + # Did we get any hits? + if results.hits: + # First search stats + request.write(results.stats(request, request.formatter, hitsFrom)) - # Then search results - info = not titlesearch - if context: - output = results.pageListWithContext(request, request.formatter, - info=info, context=context, hitsFrom=hitsFrom) + # Then search results + info = not titlesearch + if context: + output = results.pageListWithContext(request, request.formatter, + info=info, context=context, hitsFrom=hitsFrom) + else: + output = results.pageList(request, request.formatter, info=info, + hitsFrom=hitsFrom) + request.write(output) else: - output = results.pageList(request, request.formatter, info=info, - hitsFrom=hitsFrom) - request.write(output) + f = request.formatter + request.write(''.join([ + f.heading(1, 3), + f.text(_('Your search query didn\'t return any results.')), + f.heading(0, 3), + ])) request.write(request.formatter.endContent()) request.theme.send_footer(pagename) diff -r 835c392be95d -r 6d5d345599a1 MoinMoin/macro/SystemInfo.py --- a/MoinMoin/macro/SystemInfo.py Mon Aug 14 23:28:54 2006 +0200 +++ b/MoinMoin/macro/SystemInfo.py Mon Aug 14 23:52:48 2006 +0200 @@ -115,7 +115,7 @@ def execute(Macro, args): from MoinMoin.search.builtin import Search xapState = (_('Disabled'), _('Enabled')) idxState = (_('index available'), _('index unavailable')) - out = xapState[request.cfg.xapian_search] + xapRow = xapState[request.cfg.xapian_search] if request.cfg.xapian_search: idx = Search._xapianIndex(request) @@ -124,9 +124,9 @@ def execute(Macro, args): request.user.getFormattedDateTime( wikiutil.version2timestamp(idx.mtime())) or _('N/A')) - out += ', %s, %s' % (available, mtime) + xapRow += ', %s, %s' % (available, mtime) - row(_('Xapian search'), out) + row(_('Xapian search'), xapRow) row(_('Xapian stemming'), xapState[request.cfg.xapian_stemming]) row(_('Active threads'), t_count or _('N/A')) # HG 
changeset patch # User Franz Pletz # Date 1155593167 -7200 # Node ID e26799c574b007abf9259068ac8db206042a225b # Parent 6d46d6627b2f8d4313b80b26fcbdc4d5d8916548 small bugfix for info action diff -r 6d46d6627b2f -r e26799c574b0 MoinMoin/action/info.py --- a/MoinMoin/action/info.py Tue Aug 15 00:01:43 2006 +0200 +++ b/MoinMoin/action/info.py Tue Aug 15 00:06:07 2006 +0200 @@ -90,7 +90,7 @@ def execute(pagename, request): may_revert = request.user.may.revert(pagename) def render_action(text, query, **kw): - kw.update(rel='nofollow') + kw.update(dict(rel='nofollow')) if 0: # diff button doesnt work XXX params_html = [] for k, v in query.items(): # HG changeset patch # User Franz Pletz # Date 1155671907 -7200 # Node ID a90ca97f1a9bfa5eb966340fd88bcaaa1ac6eef6 # Parent e26799c574b007abf9259068ac8db206042a225b removed mooooin gfx diff -r e26799c574b0 -r a90ca97f1a9b MoinMoin/search/results.py --- a/MoinMoin/search/results.py Tue Aug 15 00:06:07 2006 +0200 +++ b/MoinMoin/search/results.py Tue Aug 15 21:58:27 2006 +0200 @@ -287,7 +287,7 @@ class SearchResults: output = [ formatter.paragraph(1, attr={'class': 'searchstats'}), _("Results %(bs)s%(hitsFrom)d - %(hitsTo)d%(be)s " - "of %(aboutHits)s %(bs)s%(hits)d%(be)s results out of" + "of %(aboutHits)s %(bs)s%(hits)d%(be)s results out of " "about %(pages)d pages.") % {'aboutHits': self.estimated_hits[0], 'hits': self.estimated_hits[1], 'pages': self.pages, @@ -652,10 +652,6 @@ class SearchResults: return ''.join(output) return '' - def _img_url(self, img): - cfg = self.request.cfg - return '%s/%s/img/%s.png' % (cfg.url_prefix, self.request.theme.name, img) - def formatPrevNextPageLinks(self, hitsFrom, hitsPerPage, hitsNum): """ Format previous and next page links in page @@ -713,39 +709,6 @@ class SearchResults: return ''.join([ f.table(1, attrs={'tableclass': 'searchpages'}), - f.table_row(1), - f.table_cell(1, attrs={'class': 'prev'}), - # first image, previous page - l[0] and - ''.join([ - f.url(1, 
href=page_url(cur_page-1)), - f.image(self._img_url('nav_prev')), - f.url(0), - ]) or - f.image(self._img_url('nav_first')), - f.table_cell(0), - # images for ooos, highlighted current page - ''.join([ - ''.join([ - f.table_cell(1), - i != cur_page and f.url(1, href=page_url(i)) or '', - f.image(self._img_url(i == cur_page and - 'nav_current' or 'nav_page')), - i != cur_page and f.url(0) or '', - f.table_cell(0), - ]) for i in page_range - ]), - f.table_cell(1, attrs={'class': 'next'}), - # last image, next page - l[-1] and - ''.join([ - f.url(1, href=page_url(cur_page+1)), - f.image(self._img_url('nav_next')), - f.url(0), - ]) or - f.image(self._img_url('nav_last')), - f.table_cell(0), - f.table_row(0), f.table_row(1), f.table_cell(1), # textlinks @@ -772,7 +735,6 @@ class SearchResults: f.paragraph(0), ]) - def querystring(self, querydict=None): """ Return query string, used in the page link """ if querydict is None: diff -r e26799c574b0 -r a90ca97f1a9b wiki/htdocs/modern/css/common.css --- a/wiki/htdocs/modern/css/common.css Tue Aug 15 00:06:07 2006 +0200 +++ b/wiki/htdocs/modern/css/common.css Tue Aug 15 21:58:27 2006 +0200 @@ -363,26 +363,17 @@ p.searchstats { .searchpages tr, .searchpages td { border: 0; - padding: 0; + padding: 5px; margin: 0; text-align: center; vertical-align: middle; - color: #a90a08; + color: #b93a58; font-weight: bold; -} - -.searchpages td.prev { - text-align: right; -} - -.searchpage td.next { - text-align: left; + font-size: 1.05em; } .searchpages td a, .searchpages td a:link { - color: #000000; text-decoration: underline; - font-weight: normal; } /* MonthCalendar css */ diff -r e26799c574b0 -r a90ca97f1a9b wiki/htdocs/modern/img/nav_current.png Binary file wiki/htdocs/modern/img/nav_current.png has changed diff -r e26799c574b0 -r a90ca97f1a9b wiki/htdocs/modern/img/nav_first.png Binary file wiki/htdocs/modern/img/nav_first.png has changed diff -r e26799c574b0 -r a90ca97f1a9b wiki/htdocs/modern/img/nav_last.png Binary file 
wiki/htdocs/modern/img/nav_last.png has changed diff -r e26799c574b0 -r a90ca97f1a9b wiki/htdocs/modern/img/nav_next.png Binary file wiki/htdocs/modern/img/nav_next.png has changed diff -r e26799c574b0 -r a90ca97f1a9b wiki/htdocs/modern/img/nav_page.png Binary file wiki/htdocs/modern/img/nav_page.png has changed diff -r e26799c574b0 -r a90ca97f1a9b wiki/htdocs/modern/img/nav_prev.png Binary file wiki/htdocs/modern/img/nav_prev.png has changed # HG changeset patch # User Franz Pletz # Date 1155672157 -7200 # Node ID 1194f8f91158b6f748508f64f8cfdc56dc5e6dd0 # Parent a90ca97f1a9bfa5eb966340fd88bcaaa1ac6eef6 updated CHANGES.fpletz diff -r a90ca97f1a9b -r 1194f8f91158 docs/CHANGES.fpletz --- a/docs/CHANGES.fpletz Tue Aug 15 21:58:27 2006 +0200 +++ b/docs/CHANGES.fpletz Tue Aug 15 22:02:37 2006 +0200 @@ -228,7 +228,14 @@ 2006-08-09 -> still TODO: need real weight 2006-08-10 - * entry missing + * corrected range and count of results (estimated by xapian) + * pagelinks only there are enough results 2006-08-10 .. 
13 no work on project +2006-08-14 + * fixed some remaining issues with the ui + +2006-08-15 + * removed Moooin gfx as requested by Google + # HG changeset patch # User Thomas Waldmann # Date 1155829247 -7200 # Node ID c604db1542f5ea5320f34ee4d746b0815b171a0a # Parent 1194f8f91158b6f748508f64f8cfdc56dc5e6dd0 added missing CHANGES entries diff -r 1194f8f91158 -r c604db1542f5 docs/CHANGES.fpletz --- a/docs/CHANGES.fpletz Tue Aug 15 22:02:37 2006 +0200 +++ b/docs/CHANGES.fpletz Thu Aug 17 17:40:47 2006 +0200 @@ -239,3 +239,6 @@ 2006-08-15 2006-08-15 * removed Moooin gfx as requested by Google +2006-08-16 no work on project + + # HG changeset patch # User Franz Pletz # Date 1155796462 -7200 # Node ID 12ddd8661ad6f8ba49b9cf7c35f5a700f6f4f9a9 # Parent 1194f8f91158b6f748508f64f8cfdc56dc5e6dd0 basic structure for new AdvancedSearch macro, py2.4 fix for html formatter diff -r 1194f8f91158 -r 12ddd8661ad6 MoinMoin/formatter/text_html.py --- a/MoinMoin/formatter/text_html.py Tue Aug 15 22:02:37 2006 +0200 +++ b/MoinMoin/formatter/text_html.py Thu Aug 17 08:34:22 2006 +0200 @@ -6,7 +6,12 @@ @license: GNU GPL, see COPYING for details. """ import os.path, re -from sets import Set # TODO: when we require Python 2.4+ use the builtin 'set' type + +try: + set +except: + from sets import Set as set + from MoinMoin.formatter import FormatterBase from MoinMoin import wikiutil, i18n, config from MoinMoin.Page import Page @@ -16,7 +21,7 @@ prettyprint = False prettyprint = False # These are the HTML elements that we treat as block elements. 
-_blocks = Set(['dd', 'div', 'dl', 'dt', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', +_blocks = set(['dd', 'div', 'dl', 'dt', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'li', 'ol', 'p', 'pre', 'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr', 'ul', 'blockquote', ]) @@ -26,30 +31,30 @@ _blocks = Set(['dd', 'div', 'dl', 'dt', # content, and also IE has a parsing bug with those two elements (only) # when they don't have a closing tag even if valid XHTML. -_self_closing_tags = Set(['area', 'base', 'br', 'col', 'frame', 'hr', 'img', 'input', - 'isindex', 'link', 'meta', 'param']) +_self_closing_tags = set(['area', 'base', 'br', 'col', 'frame', 'hr', 'img', + 'input', 'isindex', 'link', 'meta', 'param']) # We only open those tags and let the browser auto-close them: -_auto_closing_tags = Set(['p']) +_auto_closing_tags = set(['p']) # These are the elements which generally should cause an increase in the # indention level in the html souce code. -_indenting_tags = Set(['ol', 'ul', 'dl', 'li', 'dt', 'dd', 'tr', 'td']) +_indenting_tags = set(['ol', 'ul', 'dl', 'li', 'dt', 'dd', 'tr', 'td']) # These are the elements that discard any whitespace they contain as # immediate child nodes. -_space_eating_tags = Set(['colgroup', 'dl', 'frameset', 'head', 'map' 'menu', +_space_eating_tags = set(['colgroup', 'dl', 'frameset', 'head', 'map' 'menu', 'ol', 'optgroup', 'select', 'table', 'tbody', 'tfoot', 'thead', 'tr', 'ul']) # These are standard HTML attributes which are typically used without any # value; e.g., as boolean flags indicated by their presence. -_html_attribute_boolflags = Set(['compact', 'disabled', 'ismap', 'nohref', +_html_attribute_boolflags = set(['compact', 'disabled', 'ismap', 'nohref', 'noresize', 'noshade', 'nowrap', 'readonly', 'selected', 'wrap']) # These are all the standard HTML attributes that are allowed on any element. 
-_common_attributes = Set(['accesskey', 'class', 'dir', 'disabled', 'id', 'lang', +_common_attributes = set(['accesskey', 'class', 'dir', 'disabled', 'id', 'lang', 'style', 'tabindex', 'title']) diff -r 1194f8f91158 -r 12ddd8661ad6 MoinMoin/macro/FullSearch.py --- a/MoinMoin/macro/FullSearch.py Tue Aug 15 22:02:37 2006 +0200 +++ b/MoinMoin/macro/FullSearch.py Thu Aug 17 08:34:22 2006 +0200 @@ -23,7 +23,8 @@ context argument, or make another macro that use context, which may be easier to use. - @copyright: 2000-2004 by Jürgen Hermann + @copyright: 2000-2004 by Jürgen Hermann , + 2005 MoinMoin:FranzPletz @license: GNU GPL, see COPYING for details. """ diff -r 1194f8f91158 -r 12ddd8661ad6 MoinMoin/macro/AdvancedSearch.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MoinMoin/macro/AdvancedSearch.py Thu Aug 17 08:34:22 2006 +0200 @@ -0,0 +1,83 @@ +# -*- coding: iso-8859-1 -*- +''' + MoinMoin - AdvancedSearch Macro + + [[AdvancedSearch]] + displays advanced search dialog. + + MAYBE: + [[AdvancedSearch(Help)]] + embed results of an advanced search (use more parameters...) +''' + +from MoinMoin import config, wikiutil, search + +Dependencies = ['pages'] + +def advanced_ui(macro): + _ = macro._ + f = macro.formatter + + search_boxes = ''.join([ + ''.join([ + f.table_row(1), + f.table_cell(1), + f.text('%s:' % _(txt)), + f.table_cell(0), + f.table_cell(1), + f.rawHTML(input_field), + f.table_cell(0), + f.table_row(0), + ]) for txt, input_field in ( + ('Search for pages containing all the following terms', + ''), + ('Search for pages containing one or more of the following ' + 'terms', ''), + ('Search for pages not containing the following terms', + ''), + ('Search for pages containing only one of the following terms', + ''), + # TODO: dropdown-box? 
+ ('Search for pages belonging to one of the following categories', + ''), + ) + ]) + + search_options = ''.join([ + ''.join([ + f.table_row(1), + f.table_cell(1, colspan=2), + f.text(_(txt)), + f.table_cell(0), + f.table_row(0), + ]) for txt in ('Language', 'xxxx') + ]) + + html = [ + u'
', + u'
', + u'', + u'' % 0, + f.table(1), + search_boxes, + search_options, + f.table(0), + u'' % _('Go get it!'), + u'
', + u'
', + ] + + return f.rawHTML('\n'.join(html)) + + +def execute(macro, needle): + request = macro.request + _ = request.getText + + # no args given + if needle is None: + return advanced_ui(macro) + + return macro.formatter.rawHTML('wooza!') + + # HG changeset patch # User Franz Pletz # Date 1155898601 -7200 # Node ID 4c29aeea8bf713244216f0633016ac03ebd4432f # Parent c3fcddbf0c511c732d81130a913820d736314051 languages dropdown menu diff -r c3fcddbf0c51 -r 4c29aeea8bf7 MoinMoin/macro/AdvancedSearch.py --- a/MoinMoin/macro/AdvancedSearch.py Fri Aug 18 11:43:50 2006 +0200 +++ b/MoinMoin/macro/AdvancedSearch.py Fri Aug 18 12:56:41 2006 +0200 @@ -11,6 +11,7 @@ ''' from MoinMoin import config, wikiutil, search +from MoinMoin.i18n import languages Dependencies = ['pages'] @@ -43,21 +44,30 @@ def advanced_ui(macro): ) ]) + langs = dict([(lang, lmeta['x-language-in-english']) + for lang, lmeta in languages.iteritems()]) + lang_dropdown = ''.join([ + u'', + ]) + search_options = ''.join([ ''.join([ f.table_row(1), f.table_cell(1, colspan=2), - f.text(_(txt)), + txt, f.table_cell(0), f.table_row(0), - ]) for txt in ('Language', 'xxxx') + ]) for txt in ('Language:' + lang_dropdown, 'xxxx') ]) html = [ u'
', u'
', u'', - u'' % 0, + u'', f.table(1), search_boxes, search_options, # HG changeset patch # User Franz Pletz # Date 1155899577 -7200 # Node ID 949341c1c5ed0971aa7e2089ade84a36e9185fb5 # Parent 4c29aeea8bf713244216f0633016ac03ebd4432f index author und revision number diff -r 4c29aeea8bf7 -r 949341c1c5ed MoinMoin/search/Xapian.py --- a/MoinMoin/search/Xapian.py Fri Aug 18 12:56:41 2006 +0200 +++ b/MoinMoin/search/Xapian.py Fri Aug 18 13:12:57 2006 +0200 @@ -152,7 +152,7 @@ class Index(BaseIndex): # http://svn.xapian.org/*checkout*/trunk/xapian-applications/omega/docs/termprefixes.txt 'author': 'A', 'date': 'D', # numeric format: YYYYMMDD or "latest" - e.g. D20050224 or Dlatest - #G newsGroup (or similar entity - e.g. a web forum name) 'hostname': 'H', 'keyword': 'K', 'lang': 'L', # ISO Language code @@ -174,6 +174,7 @@ 'category': 'XCAT', # category this document belongs to 'full_title': 'XFT', # full title (for regex) 'domain': 'XDOMAIN', # standard or underlay + 'revision': 'XREV', # revision of page #Y year (four digits) } @@ -364,6 +365,8 @@ pagename = page.page_name mtime = page.mtime_usecs() itemid = "%s:%s" % (wikiname, pagename) + revision = str(page.get_real_rev()) + author = page.last_edit(request)['editor'] # XXX: Hack until we get proper metadata language, stem_language = self._get_languages(page) categories = self._get_categories(page) @@ -397,7 +400,9 @@ xkeywords = [xapdoc.Keyword('itemid', itemid), xapdoc.Keyword('lang', language), xapdoc.Keyword('stem_lang', stem_language), - xapdoc.Keyword('full_title', pagename.lower())] + xapdoc.Keyword('full_title', pagename.lower()), + xapdoc.Keyword('revision', revision), + xapdoc.Keyword('author', author)] for pagelink in page.getPageLinks(request): xkeywords.append(xapdoc.Keyword('linkto', pagelink)) for category in categories: # HG changeset patch # User Franz 
Pletz # Date 1155903908 -7200 # Node ID 0a291dcb05552f34a62a01907d4465b3be3935a7 # Parent 949341c1c5ed0971aa7e2089ade84a36e9185fb5 more work on the advanced ui.. css, mimetype and language stuff diff -r 949341c1c5ed -r 0a291dcb0555 MoinMoin/action/fullsearch.py --- a/MoinMoin/action/fullsearch.py Fri Aug 18 13:12:57 2006 +0200 +++ b/MoinMoin/action/fullsearch.py Fri Aug 18 14:25:08 2006 +0200 @@ -23,14 +23,20 @@ def isTitleSearch(request): try: return int(request.form['titlesearch'][0]) except ValueError: - return True + return not isAdvancedSearch(request) except KeyError: return 'fullsearch' not in request.form +def isAdvancedSearch(request): + try: + return int(request.form['advancedsearch'][0]) + except ValueError: + return False def execute(pagename, request, fieldname='value', titlesearch=0): _ = request.getText titlesearch = isTitleSearch(request) + advancedsearch = isAdvancedSearch(request) # context is relevant only for full search if titlesearch: diff -r 949341c1c5ed -r 0a291dcb0555 MoinMoin/macro/AdvancedSearch.py --- a/MoinMoin/macro/AdvancedSearch.py Fri Aug 18 13:12:57 2006 +0200 +++ b/MoinMoin/macro/AdvancedSearch.py Fri Aug 18 14:25:08 2006 +0200 @@ -20,7 +20,11 @@ def advanced_ui(macro): f = macro.formatter search_boxes = ''.join([ - ''.join([ + f.table_row(1), + f.table_cell(1, attrs={'rowspan': '7', 'class': 'searchfor'}), + f.text(_('Search for pages')), + f.table_cell(0), + ''.join([''.join([ f.table_row(1), f.table_cell(1), f.text('%s:' % _(txt)), @@ -30,49 +34,71 @@ def advanced_ui(macro): f.table_cell(0), f.table_row(0), ]) for txt, input_field in ( - ('Search for pages containing all the following terms', + ('containing all the following terms', ''), - ('Search for pages containing one or more of the following ' + ('containing one or more of the following ' 'terms', ''), - ('Search for pages not containing the following terms', + ('not containing the following terms', ''), - ('Search for pages containing only one of the following terms', + 
('containing only one of the following terms', ''), # TODO: dropdown-box? - ('Search for pages belonging to one of the following categories', + ('belonging to one of the following categories', ''), - ) + ('edited since/until the following date', + ''), + )]) ]) langs = dict([(lang, lmeta['x-language-in-english']) - for lang, lmeta in languages.iteritems()]) + for lang, lmeta in sorted(languages.items())]) lang_dropdown = ''.join([ u'', + ]) + + import mimetypes + ft_dropdown = ''.join([ + u'', ]) search_options = ''.join([ ''.join([ f.table_row(1), - f.table_cell(1, colspan=2), + f.table_cell(1, colspan=3), txt, f.table_cell(0), f.table_row(0), - ]) for txt in ('Language:' + lang_dropdown, 'xxxx') + ]) for txt in ( + 'Language: ' + lang_dropdown, + 'File Type: ' + ft_dropdown, + '%s' % + _('Search only in titles'), + '%s' % + _('Case-sensitive search')) ]) html = [ u'', u'
', u'', - u'', - f.table(1), + u'', + f.table(1, attrs={'tableclass': 'advancedsearch'}), search_boxes, search_options, + f.table_row(1), + f.table_cell(1, attrs={'class': 'submit', 'colspan': '3'}), + u'' % _('Go get it!'), + f.table_cell(0), + f.table_row(0), f.table(0), - u'' % _('Go get it!'), u'
', u'', ] diff -r 949341c1c5ed -r 0a291dcb0555 wiki/htdocs/modern/css/common.css --- a/wiki/htdocs/modern/css/common.css Fri Aug 18 13:12:57 2006 +0200 +++ b/wiki/htdocs/modern/css/common.css Fri Aug 18 14:25:08 2006 +0200 @@ -331,6 +331,35 @@ div.codearea pre span.DiffSeparator {col div.codearea pre span.DiffSeparator {color: #228B22; font-weight: bold} /* Search results */ +.advancedsearch { + border: 1pt solid #ADB9CC; +} + +.advancedsearch td { + vertical-align: top; + background-color: #E7E7E7; + border: 0px; +} + +.advancedsearch td.searchfor { + font-weight: bold; +} +.advancedsearch input { + border: 1px solid #ADB9CC; + background-color: #fff; +} + +.advancedsearch td.submit { + border-top: 1px solid #ADB9CC; + background-color: #fff; + text-align: right; +} + +.advancedsearch optioni, select { + border: 1px solid #ADB9CC; + background-color: #fff; +} + .searchresults dt { margin-top: 1em; # HG changeset patch # User Franz Pletz # Date 1155904466 -7200 # Node ID 25ad5f5c6e3df979145fa11f49323c24b5cae72a # Parent 0a291dcb05552f34a62a01907d4465b3be3935a7 py2.4 fix, sorted diff -r 0a291dcb0555 -r 25ad5f5c6e3d MoinMoin/macro/AdvancedSearch.py --- a/MoinMoin/macro/AdvancedSearch.py Fri Aug 18 14:25:08 2006 +0200 +++ b/MoinMoin/macro/AdvancedSearch.py Fri Aug 18 14:34:26 2006 +0200 @@ -14,6 +14,14 @@ from MoinMoin.i18n import languages from MoinMoin.i18n import languages Dependencies = ['pages'] + +try: + sorted +except NameError: + def sorted(l, *args, **kw): + l = l[:] + l.sort(*args, *kw) + return l def advanced_ui(macro): _ = macro._ # HG changeset patch # User Franz Pletz # Date 1155904563 -7200 # Node ID 640e007672ed3b3e0fb99f5ccbc33207fa529b89 # Parent 25ad5f5c6e3df979145fa11f49323c24b5cae72a small typo diff -r 25ad5f5c6e3d -r 640e007672ed MoinMoin/macro/AdvancedSearch.py --- a/MoinMoin/macro/AdvancedSearch.py Fri Aug 18 14:34:26 2006 +0200 +++ b/MoinMoin/macro/AdvancedSearch.py Fri Aug 18 14:36:03 2006 +0200 @@ -20,7 +20,7 @@ except NameError: except 
NameError: def sorted(l, *args, **kw): l = l[:] - l.sort(*args, *kw) + l.sort(*args, **kw) return l def advanced_ui(macro): # HG changeset patch # User Franz Pletz # Date 1155904907 -7200 # Node ID 6dacf20611242c8d26263d6fdfee4f9401245ce2 # Parent 640e007672ed3b3e0fb99f5ccbc33207fa529b89 cosmetic change for language and file type selection diff -r 640e007672ed -r 6dacf2061124 MoinMoin/macro/AdvancedSearch.py --- a/MoinMoin/macro/AdvancedSearch.py Fri Aug 18 14:36:03 2006 +0200 +++ b/MoinMoin/macro/AdvancedSearch.py Fri Aug 18 14:41:47 2006 +0200 @@ -35,7 +35,7 @@ def advanced_ui(macro): ''.join([''.join([ f.table_row(1), f.table_cell(1), - f.text('%s:' % _(txt)), + f.text(_(txt)), f.table_cell(0), f.table_cell(1), f.rawHTML(input_field), @@ -80,17 +80,20 @@ def advanced_ui(macro): search_options = ''.join([ ''.join([ f.table_row(1), - f.table_cell(1, colspan=3), - txt, + f.table_cell(1, attrs={'class': 'searchfor'}), + txt[0], + f.table_cell(0), + f.table_cell(1, colspan=2), + txt[1], f.table_cell(0), f.table_row(0), ]) for txt in ( - 'Language: ' + lang_dropdown, - 'File Type: ' + ft_dropdown, - '%s' % - _('Search only in titles'), - '%s' % - _('Case-sensitive search')) + ('Language', lang_dropdown), + ('File Type', ft_dropdown), + ('', '%s' % + _('Search only in titles')), + ('', '%s' % + _('Case-sensitive search'))) ]) html = [ # HG changeset patch # User Franz Pletz # Date 1155917202 -7200 # Node ID 90cb8fe71cdf304be2f85999f3f2a5b2550a8314 # Parent 6dacf20611242c8d26263d6fdfee4f9401245ce2 get fullsearch action working with the advanced search dialogue diff -r 6dacf2061124 -r 90cb8fe71cdf MoinMoin/action/fullsearch.py --- a/MoinMoin/action/fullsearch.py Fri Aug 18 14:41:47 2006 +0200 +++ b/MoinMoin/action/fullsearch.py Fri Aug 18 18:06:42 2006 +0200 @@ -8,6 +8,7 @@ @license: GNU GPL, see COPYING for details. 
""" +import re from MoinMoin.Page import Page from MoinMoin import wikiutil @@ -23,14 +24,15 @@ def isTitleSearch(request): try: return int(request.form['titlesearch'][0]) except ValueError: - return not isAdvancedSearch(request) + return True except KeyError: - return 'fullsearch' not in request.form + return 'fullsearch' not in request.form and \ + not isAdvancedSearch(request) def isAdvancedSearch(request): try: return int(request.form['advancedsearch'][0]) - except ValueError: + except KeyError: return False def execute(pagename, request, fieldname='value', titlesearch=0): @@ -41,6 +43,9 @@ def execute(pagename, request, fieldname # context is relevant only for full search if titlesearch: context = 0 + elif advancedsearch: + # XXX: hardcoded + context = 180 else: context = int(request.form.get('context', [0])[0]) @@ -52,6 +57,30 @@ def execute(pagename, request, fieldname max_context = 1 # only show first `max_context` contexts XXX still unused + if advancedsearch: + and_terms = request.form.get('and_terms', [''])[0].strip() + or_terms = request.form.get('or_terms', [''])[0].strip() + not_terms = request.form.get('not_terms', [''])[0].strip() + #xor_terms = request.form.get('xor_terms', [''])[0].strip() + categories = request.form.get('categories', [''])[0].strip() + timeframe = request.form.get('time', [''])[0].strip() + language = request.form.get('language', + [request.cfg.language_default])[0] + + word_re = re.compile(r'(\"[\w\s]+"|\w+)') + needle = '' + if language: + needle += 'language:%s ' % language + if categories: + needle += '(%s) ' % ' or '.join(['category:%s' % cat + for cat in word_re.findall(categories)]) + if and_terms: + needle += '(%s) ' % and_terms + if not_terms: + needle += '(%s) ' % ' '.join(['-%s' % t for t in word_re.findall(not_terms)]) + if or_terms: + needle += '(%s) ' % ' or '.join(word_re.findall(or_terms)) + # check for sensible search term striped = needle.strip() if len(striped) == 0: @@ -60,6 +89,7 @@ def execute(pagename, 
request, fieldname request.emit_http_headers() Page(request, pagename).send_page(request, msg=err) return + needle = striped # Setup for type of search if titlesearch: diff -r 6dacf2061124 -r 90cb8fe71cdf MoinMoin/macro/AdvancedSearch.py --- a/MoinMoin/macro/AdvancedSearch.py Fri Aug 18 14:41:47 2006 +0200 +++ b/MoinMoin/macro/AdvancedSearch.py Fri Aug 18 18:06:42 2006 +0200 @@ -29,7 +29,7 @@ def advanced_ui(macro): search_boxes = ''.join([ f.table_row(1), - f.table_cell(1, attrs={'rowspan': '7', 'class': 'searchfor'}), + f.table_cell(1, attrs={'rowspan': '6', 'class': 'searchfor'}), f.text(_('Search for pages')), f.table_cell(0), ''.join([''.join([ @@ -43,18 +43,18 @@ def advanced_ui(macro): f.table_row(0), ]) for txt, input_field in ( ('containing all the following terms', - ''), + ''), ('containing one or more of the following ' 'terms', ''), ('not containing the following terms', ''), - ('containing only one of the following terms', - ''), + #('containing only one of the following terms', + # ''), # TODO: dropdown-box? 
('belonging to one of the following categories', ''), - ('edited since/until the following date', - ''), + ('edited in the the following timeframe (XXX)', + ''), )]) ]) @@ -70,7 +70,7 @@ def advanced_ui(macro): import mimetypes ft_dropdown = ''.join([ - u'', u'' % _('any type'), ''.join(['' % (m[1], '*%s - %s' % m) for m in sorted(mimetypes.types_map.items())]), @@ -89,7 +89,7 @@ def advanced_ui(macro): f.table_row(0), ]) for txt in ( ('Language', lang_dropdown), - ('File Type', ft_dropdown), + ('File Type (XXX)', ft_dropdown), ('', '%s' % _('Search only in titles')), ('', '%s' % # HG changeset patch # User Franz Pletz # Date 1155985844 -7200 # Node ID aa33bb2b40d66de7c2ebf2e0eb17bee5d1955bed # Parent 90cb8fe71cdf304be2f85999f3f2a5b2550a8314 mimetype-search is live, i18n fixes diff -r 90cb8fe71cdf -r aa33bb2b40d6 MoinMoin/action/fullsearch.py --- a/MoinMoin/action/fullsearch.py Fri Aug 18 18:06:42 2006 +0200 +++ b/MoinMoin/action/fullsearch.py Sat Aug 19 13:10:44 2006 +0200 @@ -66,11 +66,14 @@ def execute(pagename, request, fieldname timeframe = request.form.get('time', [''])[0].strip() language = request.form.get('language', [request.cfg.language_default])[0] + mimetype = request.form.get('mimetype', [0])[0] word_re = re.compile(r'(\"[\w\s]+"|\w+)') needle = '' if language: needle += 'language:%s ' % language + if mimetype: + needle += 'mimetype:%s ' % mimetype if categories: needle += '(%s) ' % ' or '.join(['category:%s' % cat for cat in word_re.findall(categories)]) diff -r 90cb8fe71cdf -r aa33bb2b40d6 MoinMoin/macro/AdvancedSearch.py --- a/MoinMoin/macro/AdvancedSearch.py Fri Aug 18 18:06:42 2006 +0200 +++ b/MoinMoin/macro/AdvancedSearch.py Sat Aug 19 13:10:44 2006 +0200 @@ -12,6 +12,8 @@ from MoinMoin import config, wikiutil, search from MoinMoin.i18n import languages + +import mimetypes Dependencies = ['pages'] @@ -42,18 +44,18 @@ def advanced_ui(macro): f.table_cell(0), f.table_row(0), ]) for txt, input_field in ( - ('containing all the following terms', + 
(_('containing all the following terms'), ''), - ('containing one or more of the following ' - 'terms', ''), - ('not containing the following terms', + (_('containing one or more of the following terms'), + ''), + (_('not containing the following terms'), ''), #('containing only one of the following terms', # ''), # TODO: dropdown-box? - ('belonging to one of the following categories', + (_('belonging to one of the following categories'), ''), - ('edited in the the following timeframe (XXX)', + (_('edited in the the following timeframe (XXX)'), ''), )]) ]) @@ -68,7 +70,6 @@ def advanced_ui(macro): u'', ]) - import mimetypes ft_dropdown = ''.join([ u'%s' % _('Search only in titles')), ('', '%s' % diff -r 90cb8fe71cdf -r aa33bb2b40d6 MoinMoin/search/Xapian.py --- a/MoinMoin/search/Xapian.py Fri Aug 18 18:06:42 2006 +0200 +++ b/MoinMoin/search/Xapian.py Sat Aug 19 13:10:44 2006 +0200 @@ -457,11 +457,14 @@ class Index(BaseIndex): xlanguage = xapdoc.Keyword('lang', language) xstem_language = xapdoc.Keyword('stem_lang', stem_language) mimetype, att_content = self.contentfilter(filename) - xmimetype = xapdoc.TextField('mimetype', mimetype, True) + xmimetype = xapdoc.Keyword('mimetype', mimetype) xcontent = xapdoc.TextField('content', att_content) - doc = xapdoc.Document(textFields=(xcontent, xmimetype, ), - keywords=(xatt_itemid, xtitle, xlanguage, xstem_language, ), - sortFields=(xpname, xattachment, xmtime, xwname, ), + doc = xapdoc.Document(textFields=(xcontent, ), + keywords=(xatt_itemid, xtitle, + xlanguage, xstem_language, + xmimetype), + sortFields=(xpname, xattachment, xmtime, + xwname, ), ) doc.analyzerFactory = getWikiAnalyzerFactory(request, stem_language) diff -r 90cb8fe71cdf -r aa33bb2b40d6 MoinMoin/search/queryparser.py --- a/MoinMoin/search/queryparser.py Fri Aug 18 18:06:42 2006 +0200 +++ b/MoinMoin/search/queryparser.py Sat Aug 19 13:10:44 2006 +0200 @@ -694,6 +694,70 @@ class CategorySearch(TextSearch): else: pattern = self._pattern.lower() return 
UnicodeQuery('%s:%s' % (prefix, pattern)) + + +class MimetypeSearch(BaseExpression): + """ Search for files beloging to a specific mimetype """ + + def __init__(self, pattern, use_re=False, case=True): + """ Init a mimetype search + + @param pattern: pattern to search for, ascii string or unicode + @param use_re: treat pattern as re of plain text, bool + @param case: do case sensitive search, bool + """ + self._pattern = pattern.lower() + self.negated = 0 + self.use_re = use_re + self.case = False # not case-sensitive! + self.xapian_called = False + self._build_re(self._pattern, use_re=use_re, case=case) + + def costs(self): + return 5000 # cheaper than a TextSearch + + def __unicode__(self): + neg = self.negated and '-' or '' + return u'%s!"%s"' % (neg, unicode(self._pattern)) + + def highlight_re(self): + return "" + + def search(self, page): + # We just use (and trust ;)) xapian for this.. deactivated for _moinSearch + if not self.xapian_called: + return [] + else: + return [Match()] + + def xapian_wanted(self): + return True # only easy regexps possible + + def xapian_need_postproc(self): + return False # case-sensitivity would make no sense + + def xapian_term(self, request, allterms): + self.xapian_called = True + prefix = Xapian.Index.prefixMap['mimetype'] + if self.use_re: + # basic regex matching per term + terms = [] + found = None + n = len(prefix) + for term in allterms(): + if prefix == term[:n]: + found = True + if self.search_re.match(term[n:]): + terms.append(term) + elif found: + continue + + if not terms: + return Query() + return Query(Query.OP_OR, terms) + else: + pattern = self._pattern + return UnicodeQuery('%s%s' % (prefix, pattern)) ############################################################################## @@ -782,6 +846,7 @@ class QueryParser: linkto = False lang = False category = False + mimetype = False for m in modifiers: if "title".startswith(m): @@ -796,6 +861,8 @@ class QueryParser: lang = True elif "category".startswith(m): 
category = True + elif "mimetype".startswith(m): + mimetype = True # oh, let's better call xapian if we encouter this nasty regexp ;) if not category: @@ -808,6 +875,8 @@ class QueryParser: if category: obj = CategorySearch(text, use_re=regex, case=case) + elif mimetype: + obj = MimetypeSearch(text, use_re=regex, case=False) elif lang: obj = LanguageSearch(text, use_re=regex, case=False) elif linkto: # HG changeset patch # User Franz Pletz # Date 1155986264 -7200 # Node ID d5741f2b82920737febe03a1183e93c05b3553c4 # Parent aa33bb2b40d66de7c2ebf2e0eb17bee5d1955bed update CHANGES.fpletz diff -r aa33bb2b40d6 -r d5741f2b8292 docs/CHANGES.fpletz --- a/docs/CHANGES.fpletz Sat Aug 19 13:10:44 2006 +0200 +++ b/docs/CHANGES.fpletz Sat Aug 19 13:17:44 2006 +0200 @@ -8,12 +8,13 @@ Branch moin/1.6-xapian-fpletz metadata) ToDo: - * Implement the new search UI * Write/update documentation for all the new search stuff + * Search based on mtime + * Index all revisions and let users search in them (rev, mtime) + + ToDo (low priority): * Reevaluate Xapwrap, possibly drop it and rip out usable stuff (i.e. ExceptionTranslator) - - ToDo (low priority): * Case-sensitive searches / Regexp on multiple terms: Graceful fallback to and/or merge with moinSearch based on nodes xapian can handle in the search term tree @@ -23,10 +24,11 @@ Branch moin/1.6-xapian-fpletz New Features: * Faster search thanks to Xapian - * Searching for languages with new prefix lang/language, i.e. 
lang:de + * New searches: + - LanguageSearch: language:de + - CategorySearch: category:Homepage + - MimetypeSearch: mimetype:image/png (for attachments/files) Note: Currently only available when Xapian is used - * CategorySearch with prefix category or with the regexp previously - used (autodetected as CategorySearch) * New config options: xapian_search 0 enables xapian-powered search xapian_index_dir None directory for xapian indices @@ -241,4 +243,15 @@ 2006-08-15 2006-08-16 no work on project - +2006-08-17 + * started advanced gui, new macro: AdvancedSearch + +2006-08-18 + * eye-candy for advanced gui + * reworked fullsearch action to work with AdvancedSearch and most of + the + +2006-08-19 + * mimetype search works (more or less) + * minor bugfixes (i18n etc.) + # HG changeset patch # User Franz Pletz # Date 1155987349 -7200 # Node ID 649b60a4064a023911e3d9f57871cbeb88e8cdc7 # Parent d5741f2b82920737febe03a1183e93c05b3553c4 domain-secific search (for system pages) diff -r d5741f2b8292 -r 649b60a4064a MoinMoin/action/fullsearch.py --- a/MoinMoin/action/fullsearch.py Sat Aug 19 13:17:44 2006 +0200 +++ b/MoinMoin/action/fullsearch.py Sat Aug 19 13:35:49 2006 +0200 @@ -67,6 +67,7 @@ def execute(pagename, request, fieldname language = request.form.get('language', [request.cfg.language_default])[0] mimetype = request.form.get('mimetype', [0])[0] + underlay = request.form.get('underlay', [0])[0] word_re = re.compile(r'(\"[\w\s]+"|\w+)') needle = '' @@ -74,6 +75,8 @@ def execute(pagename, request, fieldname needle += 'language:%s ' % language if mimetype: needle += 'mimetype:%s ' % mimetype + if underlay: + needle += 'domain:underlay ' if categories: needle += '(%s) ' % ' or '.join(['category:%s' % cat for cat in word_re.findall(categories)]) diff -r d5741f2b8292 -r 649b60a4064a MoinMoin/macro/AdvancedSearch.py --- a/MoinMoin/macro/AdvancedSearch.py Sat Aug 19 13:17:44 2006 +0200 +++ b/MoinMoin/macro/AdvancedSearch.py Sat Aug 19 13:35:49 2006 +0200 @@ -94,7 +94,10 @@ 
def advanced_ui(macro): ('', '%s' % _('Search only in titles')), ('', '%s' % - _('Case-sensitive search'))) + _('Case-sensitive search')), + ('', '%s' + '' % _('Only system pages')), + ) ]) html = [ diff -r d5741f2b8292 -r 649b60a4064a MoinMoin/search/queryparser.py --- a/MoinMoin/search/queryparser.py Sat Aug 19 13:17:44 2006 +0200 +++ b/MoinMoin/search/queryparser.py Sat Aug 19 13:35:49 2006 +0200 @@ -697,7 +697,7 @@ class CategorySearch(TextSearch): class MimetypeSearch(BaseExpression): - """ Search for files beloging to a specific mimetype """ + """ Search for files belonging to a specific mimetype """ def __init__(self, pattern, use_re=False, case=True): """ Init a mimetype search @@ -758,6 +758,72 @@ class MimetypeSearch(BaseExpression): else: pattern = self._pattern return UnicodeQuery('%s%s' % (prefix, pattern)) + + +class DomainSearch(BaseExpression): + """ Search for pages belonging to a specific domain """ + + def __init__(self, pattern, use_re=False, case=True): + """ Init a mimetype search + + @param pattern: pattern to search for, ascii string or unicode + @param use_re: treat pattern as re of plain text, bool + @param case: do case sensitive search, bool + """ + self._pattern = pattern.lower() + self.negated = 0 + self.use_re = use_re + self.case = False # not case-sensitive! + self.xapian_called = False + self._build_re(self._pattern, use_re=use_re, case=case) + + def costs(self): + return 5000 # cheaper than a TextSearch + + def __unicode__(self): + neg = self.negated and '-' or '' + return u'%s!"%s"' % (neg, unicode(self._pattern)) + + def highlight_re(self): + return "" + + def search(self, page): + # We just use (and trust ;)) xapian for this.. 
deactivated for _moinSearch + if not self.xapian_called: + return [] + else: + return [Match()] + + def xapian_wanted(self): + return True # only easy regexps possible + + def xapian_need_postproc(self): + return False # case-sensitivity would make no sense + + def xapian_term(self, request, allterms): + self.xapian_called = True + prefix = Xapian.Index.prefixMap['domain'] + if self.use_re: + # basic regex matching per term + terms = [] + found = None + n = len(prefix) + for term in allterms(): + if prefix == term[:n]: + found = True + if self.search_re.match(term[n+1:]): + terms.append(term) + elif found: + continue + + if not terms: + return Query() + return Query(Query.OP_OR, terms) + else: + pattern = self._pattern + return UnicodeQuery('%s:%s' % (prefix, pattern)) + + ############################################################################## @@ -847,6 +913,7 @@ class QueryParser: lang = False category = False mimetype = False + domain = False for m in modifiers: if "title".startswith(m): @@ -863,6 +930,8 @@ class QueryParser: category = True elif "mimetype".startswith(m): mimetype = True + elif "domain".startswith(m): + domain = True # oh, let's better call xapian if we encouter this nasty regexp ;) if not category: @@ -881,6 +950,8 @@ class QueryParser: obj = LanguageSearch(text, use_re=regex, case=False) elif linkto: obj = LinkSearch(text, use_re=regex, case=case) + elif domain: + obj = DomainSearch(text, use_re=regex, case=False) elif title_search: obj = TitleSearch(text, use_re=regex, case=case) else: diff -r d5741f2b8292 -r 649b60a4064a docs/CHANGES.fpletz --- a/docs/CHANGES.fpletz Sat Aug 19 13:17:44 2006 +0200 +++ b/docs/CHANGES.fpletz Sat Aug 19 13:35:49 2006 +0200 @@ -28,6 +28,7 @@ Branch moin/1.6-xapian-fpletz - LanguageSearch: language:de - CategorySearch: category:Homepage - MimetypeSearch: mimetype:image/png (for attachments/files) + - DomainSearch: domain:underlay Note: Currently only available when Xapian is used * New config options: 
xapian_search 0 enables xapian-powered search @@ -254,4 +255,5 @@ 2006-08-19 2006-08-19 * mimetype search works (more or less) * minor bugfixes (i18n etc.) - + * domain-specific search (underlay -> system pages) + # HG changeset patch # User Franz Pletz # Date 1156025665 -7200 # Node ID bb37beca754503b688db996592e5147793cd3aee # Parent 649b60a4064a023911e3d9f57871cbeb88e8cdc7 fixed system pages search, added underlay search, started with mtime filtering diff -r 649b60a4064a -r bb37beca7545 MoinMoin/action/fullsearch.py --- a/MoinMoin/action/fullsearch.py Sat Aug 19 13:35:49 2006 +0200 +++ b/MoinMoin/action/fullsearch.py Sun Aug 20 00:14:25 2006 +0200 @@ -67,7 +67,9 @@ def execute(pagename, request, fieldname language = request.form.get('language', [request.cfg.language_default])[0] mimetype = request.form.get('mimetype', [0])[0] - underlay = request.form.get('underlay', [0])[0] + includeunderlay = request.form.get('includeunderlay', [0])[0] + onlysystempages = request.form.get('onlysystempages', [0])[0] + mtime = request.form.get('mtime', [''])[0] word_re = re.compile(r'(\"[\w\s]+"|\w+)') needle = '' @@ -75,8 +77,12 @@ def execute(pagename, request, fieldname needle += 'language:%s ' % language if mimetype: needle += 'mimetype:%s ' % mimetype - if underlay: - needle += 'domain:underlay ' + if not includeunderlay: + needle += '-domain:underlay ' + if onlysystempages: + needle += 'domain:system ' + if mtime: + needle += 'lastmodifiedsince:%s ' % mtime if categories: needle += '(%s) ' % ' or '.join(['category:%s' % cat for cat in word_re.findall(categories)]) diff -r 649b60a4064a -r bb37beca7545 MoinMoin/macro/AdvancedSearch.py --- a/MoinMoin/macro/AdvancedSearch.py Sat Aug 19 13:35:49 2006 +0200 +++ b/MoinMoin/macro/AdvancedSearch.py Sun Aug 20 00:14:25 2006 +0200 @@ -55,8 +55,8 @@ def advanced_ui(macro): # TODO: dropdown-box? 
(_('belonging to one of the following categories'), ''), - (_('edited in the the following timeframe (XXX)'), - ''), + (_('last modified since (XXX)'), + ''), )]) ]) @@ -95,7 +95,9 @@ def advanced_ui(macro): _('Search only in titles')), ('', '%s' % _('Case-sensitive search')), - ('', '%s' + ('', '%s' + '' % _('Include underlay')), + ('', '%s' '' % _('Only system pages')), ) ]) diff -r 649b60a4064a -r bb37beca7545 MoinMoin/search/Xapian.py --- a/MoinMoin/search/Xapian.py Sat Aug 19 13:35:49 2006 +0200 +++ b/MoinMoin/search/Xapian.py Sun Aug 20 00:14:25 2006 +0200 @@ -351,6 +351,8 @@ class Index(BaseIndex): yield 'underlay' if page.isStandardPage(): yield 'standard' + if wikiutil.isSystemPage(self.request, page.page_name): + yield 'system' def _index_page(self, writer, page, mode='update'): """ Index a page - assumes that the write lock is acquired @@ -402,7 +404,8 @@ class Index(BaseIndex): xapdoc.Keyword('stem_lang', stem_language), xapdoc.Keyword('full_title', pagename.lower()), xapdoc.Keyword('revision', revision), - xapdoc.Keyword('author', author)] + xapdoc.Keyword('author', author), + )] for pagelink in page.getPageLinks(request): xkeywords.append(xapdoc.Keyword('linkto', pagelink)) for category in categories: @@ -462,7 +465,7 @@ class Index(BaseIndex): doc = xapdoc.Document(textFields=(xcontent, ), keywords=(xatt_itemid, xtitle, xlanguage, xstem_language, - xmimetype), + xmimetype, ), sortFields=(xpname, xattachment, xmtime, xwname, ), ) diff -r 649b60a4064a -r bb37beca7545 MoinMoin/search/queryparser.py --- a/MoinMoin/search/queryparser.py Sat Aug 19 13:35:49 2006 +0200 +++ b/MoinMoin/search/queryparser.py Sun Aug 20 00:14:25 2006 +0200 @@ -764,7 +764,7 @@ class DomainSearch(BaseExpression): """ Search for pages belonging to a specific domain """ def __init__(self, pattern, use_re=False, case=True): - """ Init a mimetype search + """ Init a domain search @param pattern: pattern to search for, ascii string or unicode @param use_re: treat pattern as re of 
plain text, bool @@ -822,8 +822,6 @@ class DomainSearch(BaseExpression): else: pattern = self._pattern return UnicodeQuery('%s:%s' % (prefix, pattern)) - - ############################################################################## # HG changeset patch # User Franz Pletz # Date 1156120205 -7200 # Node ID 6b0ea72d766504704410e83c5f5604b291343699 # Parent fa0b7d2d998b8d294d26e1172c16e608ff4b9f41 mtime search works, added MoinMoin.support.parsedatetime, small fixes diff -r fa0b7d2d998b -r 6b0ea72d7665 MoinMoin/action/fullsearch.py --- a/MoinMoin/action/fullsearch.py Sun Aug 20 02:05:35 2006 +0200 +++ b/MoinMoin/action/fullsearch.py Mon Aug 21 02:30:05 2006 +0200 @@ -8,9 +8,10 @@ @license: GNU GPL, see COPYING for details. """ -import re +import re, time from MoinMoin.Page import Page from MoinMoin import wikiutil +from MoinMoin.support.parsedatetime.parsedatetime import Calendar def isTitleSearch(request): @@ -54,6 +55,8 @@ def execute(pagename, request, fieldname case = int(request.form.get('case', [0])[0]) regex = int(request.form.get('regex', [0])[0]) # no interface currently hitsFrom = int(request.form.get('from', [0])[0]) + mtime = None + msg = '' max_context = 1 # only show first `max_context` contexts XXX still unused @@ -69,7 +72,19 @@ def execute(pagename, request, fieldname mimetype = request.form.get('mimetype', [0])[0] includeunderlay = request.form.get('includeunderlay', [0])[0] onlysystempages = request.form.get('onlysystempages', [0])[0] + mtime = request.form.get('mtime', [''])[0] + if mtime: + cal = Calendar() + mtime_parsed = cal.parse(mtime) + + if mtime_parsed[1] == 0 and mtime_parsed[0] <= time.localtime(): + mtime = time.mktime(mtime_parsed[0]) + else: + msg = _('The modification date you entered was not recognized ' + 'and is therefore not considered for the search ' + 'results!') + mtime = None word_re = re.compile(r'(\"[\w\s]+"|\w+)') needle = '' @@ -81,8 +96,6 @@ def execute(pagename, request, fieldname needle += '-domain:underlay ' if 
onlysystempages: needle += 'domain:system ' - if mtime: - needle += 'lastmodifiedsince:%s ' % mtime if categories: needle += '(%s) ' % ' or '.join(['category:%s' % cat for cat in word_re.findall(categories)]) @@ -94,14 +107,14 @@ def execute(pagename, request, fieldname needle += '(%s) ' % ' or '.join(word_re.findall(or_terms)) # check for sensible search term - striped = needle.strip() - if len(striped) == 0: + stripped = needle.strip() + if len(stripped) == 0: err = _('Please use a more selective search term instead ' 'of {{{"%s"}}}') % needle request.emit_http_headers() Page(request, pagename).send_page(request, msg=err) return - needle = striped + needle = stripped # Setup for type of search if titlesearch: @@ -115,7 +128,7 @@ def execute(pagename, request, fieldname from MoinMoin.search import searchPages, QueryParser query = QueryParser(case=case, regex=regex, titlesearch=titlesearch).parse_query(needle) - results = searchPages(request, query, sort) + results = searchPages(request, query, sort, mtime) # directly show a single hit # XXX won't work with attachment search @@ -135,7 +148,8 @@ def execute(pagename, request, fieldname # This action generate data using the user language request.setContentLanguage(request.lang) - request.theme.send_title(title % needle, form=request.form, pagename=pagename) + request.theme.send_title(title % needle, form=request.form, + pagename=pagename, msg=msg) # Start content (important for RTL support) request.write(request.formatter.startContent("content")) diff -r fa0b7d2d998b -r 6b0ea72d7665 MoinMoin/macro/AdvancedSearch.py --- a/MoinMoin/macro/AdvancedSearch.py Sun Aug 20 02:05:35 2006 +0200 +++ b/MoinMoin/macro/AdvancedSearch.py Mon Aug 21 02:30:05 2006 +0200 @@ -12,18 +12,12 @@ from MoinMoin import config, wikiutil, search from MoinMoin.i18n import languages +from MoinMoin.support import sorted import mimetypes Dependencies = ['pages'] -try: - sorted -except NameError: - def sorted(l, *args, **kw): - l = l[:] - 
l.sort(*args, **kw) - return l def advanced_ui(macro): _ = macro._ @@ -55,7 +49,7 @@ def advanced_ui(macro): # TODO: dropdown-box? (_('belonging to one of the following categories'), ''), - (_('last modified since (XXX)'), + (_('last modified since'), ''), )]) ]) diff -r fa0b7d2d998b -r 6b0ea72d7665 MoinMoin/search/Xapian.py --- a/MoinMoin/search/Xapian.py Sun Aug 20 02:05:35 2006 +0200 +++ b/MoinMoin/search/Xapian.py Mon Aug 21 02:30:05 2006 +0200 @@ -405,7 +405,7 @@ class Index(BaseIndex): xapdoc.Keyword('full_title', pagename.lower()), xapdoc.Keyword('revision', revision), xapdoc.Keyword('author', author), - )] + ] for pagelink in page.getPageLinks(request): xkeywords.append(xapdoc.Keyword('linkto', pagelink)) for category in categories: @@ -516,33 +516,3 @@ class Index(BaseIndex): finally: writer.__del__() -def run_query(query, db): - enquire = xapian.Enquire(db) - parser = xapian.QueryParser() - query = parser.parse_query(query, xapian.QueryParser.FLAG_WILDCARD) - print query.get_description() - enquire.set_query(query) - return enquire.get_mset(0, 10) - -def run(request): - pass - #print "Begin" - #db = xapian.WritableDatabase(xapian.open('test.db', - # xapian.DB_CREATE_OR_OPEN)) - # - # index_data(db) ??? 
- #del db - #mset = run_query(sys.argv[1], db) - #print mset.get_matches_estimated() - #iterator = mset.begin() - #while iterator != mset.end(): - # print iterator.get_document().get_data() - # iterator.next() - #for i in xrange(1,170): - # doc = db.get_document(i) - # print doc.get_data() - -if __name__ == '__main__': - run() - - diff -r fa0b7d2d998b -r 6b0ea72d7665 MoinMoin/search/__init__.py --- a/MoinMoin/search/__init__.py Sun Aug 20 02:05:35 2006 +0200 +++ b/MoinMoin/search/__init__.py Mon Aug 21 02:30:05 2006 +0200 @@ -13,7 +13,7 @@ from MoinMoin.search.queryparser import from MoinMoin.search.queryparser import QueryParser from MoinMoin.search.builtin import Search -def searchPages(request, query, sort='weight', **kw): +def searchPages(request, query, sort='weight', mtime=None, **kw): """ Search the text of all pages for query. @param request: current request @@ -23,5 +23,5 @@ def searchPages(request, query, sort='we """ if isinstance(query, str) or isinstance(query, unicode): query = QueryParser(**kw).parse_query(query) - return Search(request, query, sort).run() + return Search(request, query, sort, mtime=mtime).run() diff -r fa0b7d2d998b -r 6b0ea72d7665 MoinMoin/search/builtin.py --- a/MoinMoin/search/builtin.py Sun Aug 20 02:05:35 2006 +0200 +++ b/MoinMoin/search/builtin.py Mon Aug 21 02:30:05 2006 +0200 @@ -352,10 +352,11 @@ class Search: class Search: """ A search run """ - def __init__(self, request, query, sort='weight'): + def __init__(self, request, query, sort='weight', mtime=None): self.request = request self.query = query self.sort = sort + self.mtime = mtime self.filtered = False self.fs_rootpage = "FS" # XXX FS hardcoded @@ -552,9 +553,12 @@ class Search: userMayRead = self.request.user.may.read fs_rootpage = self.fs_rootpage + "/" thiswiki = (self.request.cfg.interwikiname, 'Self') - filtered = [(wikiname, page, attachment, match) for wikiname, page, attachment, match in hits - if not wikiname in thiswiki or + filtered = [(wikiname, page, 
attachment, match) + for wikiname, page, attachment, match in hits + if (not wikiname in thiswiki or page.exists() and userMayRead(page.page_name) or - page.page_name.startswith(fs_rootpage)] + page.page_name.startswith(fs_rootpage)) and + (not self.mtime or + self.mtime <= page.mtime_usecs()/1000000)] return filtered diff -r fa0b7d2d998b -r 6b0ea72d7665 MoinMoin/support/__init__.py --- a/MoinMoin/support/__init__.py Sun Aug 20 02:05:35 2006 +0200 +++ b/MoinMoin/support/__init__.py Mon Aug 21 02:30:05 2006 +0200 @@ -10,3 +10,12 @@ @copyright: 2001-2004 by Jürgen Hermann @license: GNU GPL, see COPYING for details. """ + +try: + sorted = sorted +except NameError: + def sorted(l, *args, **kw): + l = l[:] + l.sort(*args, **kw) + return l + diff -r fa0b7d2d998b -r 6b0ea72d7665 MoinMoin/support/parsedatetime/__init__.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MoinMoin/support/parsedatetime/__init__.py Mon Aug 21 02:30:05 2006 +0200 @@ -0,0 +1,17 @@ +version = '0.6.4' +author = 'Mike Taylor ' +license = """Copyright (c) 2004-2006 Mike Taylor, All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + diff -r fa0b7d2d998b -r 6b0ea72d7665 MoinMoin/support/parsedatetime/parsedatetime.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MoinMoin/support/parsedatetime/parsedatetime.py Mon Aug 21 02:30:05 2006 +0200 @@ -0,0 +1,1112 @@ +#!/usr/bin/env python + +""" +Parse human-readable date/time text. 
+""" + +__license__ = """Copyright (c) 2004-2006 Mike Taylor, All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" +__author__ = 'Mike Taylor ' +__contributors__ = ['Darshana Chhajed ', + ] + +_debug = False + + +import string, re, time +import datetime, calendar, rfc822 +import parsedatetime_consts + + +# Copied from feedparser.py +# Universal Feedparser, Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved. +# Originally a def inside of _parse_date_w3dtf() +def _extract_date(m): + year = int(m.group('year')) + if year < 100: + year = 100 * int(time.gmtime()[0] / 100) + int(year) + if year < 1000: + return 0, 0, 0 + julian = m.group('julian') + if julian: + julian = int(julian) + month = julian / 30 + 1 + day = julian % 30 + 1 + jday = None + while jday != julian: + t = time.mktime((year, month, day, 0, 0, 0, 0, 0, 0)) + jday = time.gmtime(t)[-2] + diff = abs(jday - julian) + if jday > julian: + if diff < day: + day = day - diff + else: + month = month - 1 + day = 31 + elif jday < julian: + if day + diff < 28: + day = day + diff + else: + month = month + 1 + return year, month, day + month = m.group('month') + day = 1 + if month is None: + month = 1 + else: + month = int(month) + day = m.group('day') + if day: + day = int(day) + else: + day = 1 + return year, month, day + +# Copied from feedparser.py +# Universal Feedparser, Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved. 
+# Originally a def inside of _parse_date_w3dtf() +def _extract_time(m): + if not m: + return 0, 0, 0 + hours = m.group('hours') + if not hours: + return 0, 0, 0 + hours = int(hours) + minutes = int(m.group('minutes')) + seconds = m.group('seconds') + if seconds: + seconds = int(seconds) + else: + seconds = 0 + return hours, minutes, seconds + + +# Copied from feedparser.py +# Universal Feedparser, Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved. +# Modified to return a tuple instead of mktime +# +# Original comment: +# W3DTF-style date parsing adapted from PyXML xml.utils.iso8601, written by +# Drake and licensed under the Python license. Removed all range checking +# for month, day, hour, minute, and second, since mktime will normalize +# these later +def _parse_date_w3dtf(dateString): + # the __extract_date and __extract_time methods were + # copied-out so they could be used by my code --bear + def __extract_tzd(m): + '''Return the Time Zone Designator as an offset in seconds from UTC.''' + if not m: + return 0 + tzd = m.group('tzd') + if not tzd: + return 0 + if tzd == 'Z': + return 0 + hours = int(m.group('tzdhours')) + minutes = m.group('tzdminutes') + if minutes: + minutes = int(minutes) + else: + minutes = 0 + offset = (hours*60 + minutes) * 60 + if tzd[0] == '+': + return -offset + return offset + + __date_re = ('(?P\d\d\d\d)' + '(?:(?P-|)' + '(?:(?P\d\d\d)' + '|(?P\d\d)(?:(?P=dsep)(?P\d\d))?))?') + __tzd_re = '(?P[-+](?P\d\d)(?::?(?P\d\d))|Z)' + __tzd_rx = re.compile(__tzd_re) + __time_re = ('(?P\d\d)(?P:|)(?P\d\d)' + '(?:(?P=tsep)(?P\d\d(?:[.,]\d+)?))?' + + __tzd_re) + __datetime_re = '%s(?:T%s)?' % (__date_re, __time_re) + __datetime_rx = re.compile(__datetime_re) + m = __datetime_rx.match(dateString) + if (m is None) or (m.group() != dateString): return + return _extract_date(m) + _extract_time(m) + (0, 0, 0) + + +# Copied from feedparser.py +# Universal Feedparser, Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved. 
+# Modified to return a tuple instead of mktime +# +def _parse_date_rfc822(dateString): + '''Parse an RFC822, RFC1123, RFC2822, or asctime-style date''' + data = dateString.split() + if data[0][-1] in (',', '.') or data[0].lower() in rfc822._daynames: + del data[0] + if len(data) == 4: + s = data[3] + i = s.find('+') + if i > 0: + data[3:] = [s[:i], s[i+1:]] + else: + data.append('') + dateString = " ".join(data) + if len(data) < 5: + dateString += ' 00:00:00 GMT' + return rfc822.parsedate_tz(dateString) + +# rfc822.py defines several time zones, but we define some extra ones. +# 'ET' is equivalent to 'EST', etc. +_additional_timezones = {'AT': -400, 'ET': -500, 'CT': -600, 'MT': -700, 'PT': -800} +rfc822._timezones.update(_additional_timezones) + + +class Calendar: + """ + A collection of routines to input, parse and manipulate date and times. + The text can either be 'normal' date values or it can be human readable. + """ + + def __init__(self, constants=None): + """ + Default constructor for the Calendar class. 
+ + @type constants: object + @param constants: Instance of the class L{CalendarConstants} + + @rtype: object + @return: Calendar instance + """ + # if a constants reference is not included, use default + if constants is None: + self.ptc = parsedatetime_consts.CalendarConstants() + else: + self.ptc = constants + + self.CRE_SPECIAL = re.compile(self.ptc.RE_SPECIAL, re.IGNORECASE) + self.CRE_UNITS = re.compile(self.ptc.RE_UNITS, re.IGNORECASE) + self.CRE_QUNITS = re.compile(self.ptc.RE_QUNITS, re.IGNORECASE) + self.CRE_MODIFIER = re.compile(self.ptc.RE_MODIFIER, re.IGNORECASE) + self.CRE_MODIFIER2 = re.compile(self.ptc.RE_MODIFIER2, re.IGNORECASE) + self.CRE_TIMEHMS = re.compile(self.ptc.RE_TIMEHMS, re.IGNORECASE) + self.CRE_TIMEHMS2 = re.compile(self.ptc.RE_TIMEHMS2, re.IGNORECASE) + self.CRE_DATE = re.compile(self.ptc.RE_DATE, re.IGNORECASE) + self.CRE_DATE2 = re.compile(self.ptc.RE_DATE2, re.IGNORECASE) + self.CRE_DATE3 = re.compile(self.ptc.RE_DATE3, re.IGNORECASE) + self.CRE_MONTH = re.compile(self.ptc.RE_MONTH, re.IGNORECASE) + self.CRE_WEEKDAY = re.compile(self.ptc.RE_WEEKDAY, re.IGNORECASE) + self.CRE_DAY = re.compile(self.ptc.RE_DAY, re.IGNORECASE) + self.CRE_TIME = re.compile(self.ptc.RE_TIME, re.IGNORECASE) + self.CRE_REMAINING = re.compile(self.ptc.RE_REMAINING, re.IGNORECASE) + + self.invalidFlag = 0 # Is set if the datetime string entered cannot be parsed at all + self.weekdyFlag = 0 # monday/tuesday/... + self.dateStdFlag = 0 # 07/21/06 + self.dateStrFlag = 0 # July 21st, 2006 + self.timeFlag = 0 # 5:50 + self.meridianFlag = 0 # am/pm + self.dayStrFlag = 0 # tomorrow/yesterday/today/.. + self.timeStrFlag = 0 # lunch/noon/breakfast/... + self.modifierFlag = 0 # after/before/prev/next/.. + self.modifier2Flag = 0 # after/before/prev/next/.. + self.unitsFlag = 0 # hrs/weeks/yrs/min/.. + self.qunitsFlag = 0 # h/m/t/d.. 
+ + + def _convertUnitAsWords(self, unitText): + """ + Converts text units into their number value + + Five = 5 + Twenty Five = 25 + Two hundred twenty five = 225 + Two thousand and twenty five = 2025 + Two thousand twenty five = 2025 + + @type unitText: string + @param unitText: number string + + @rtype: integer + @return: numerical value of unitText + """ + # TODO: implement this + pass + + + def _buildTime(self, source, quantity, modifier, units): + """ + Take quantity, modifier and unit strings and convert them into values. + Then calcuate the time and return the adjusted sourceTime + + @type source: time + @param source: time to use as the base (or source) + @type quantity: string + @param quantity: quantity string + @type modifier: string + @param modifier: how quantity and units modify the source time + @type units: string + @param units: unit of the quantity (i.e. hours, days, months, etc) + + @rtype: timetuple + @return: timetuple of the calculated time + """ + if _debug: + print '_buildTime: [%s][%s][%s]' % (quantity, modifier, units) + + if source is None: + source = time.localtime() + + if quantity is None: + quantity = '' + else: + quantity = string.strip(quantity) + + if len(quantity) == 0: + qty = 1 + else: + try: + qty = int(quantity) + except ValueError: + qty = 0 + + if modifier in self.ptc.Modifiers: + qty = qty * self.ptc.Modifiers[modifier] + + if units is None or units == '': + units = 'dy' + + # plurals are handled by regex's (could be a bug tho) + + if units in self.ptc.Units: + u = self.ptc.Units[units] + else: + u = 1 + + (yr, mth, dy, hr, mn, sec, wd, yd, isdst) = source + + start = datetime.datetime(yr, mth, dy, hr, mn, sec) + target = start + + if units.startswith('y'): + target = self.inc(start, year=qty) + elif units.endswith('th') or units.endswith('ths'): + target = self.inc(start, month=qty) + else: + if units.startswith('d'): + target = start + datetime.timedelta(days=qty) + elif units.startswith('h'): + target = start + 
datetime.timedelta(hours=qty) + elif units.startswith('m'): + target = start + datetime.timedelta(minutes=qty) + elif units.startswith('s'): + target = start + datetime.timedelta(seconds=qty) + elif units.startswith('w'): + target = start + datetime.timedelta(weeks=qty) + + if target != start: + self.invalidFlag = 0 + + return target.timetuple() + + + def parseDate(self, dateString): + """ + Parses strings like 05/28/200 or 04.21 + + @type dateString: string + @param dateString: text to convert to a datetime + + @rtype: datetime + @return: calculated datetime value of dateString + """ + yr, mth, dy, hr, mn, sec, wd, yd, isdst = time.localtime() + + # XXX: Quick fix to ignore 'the' + dateString = dateString.replace('the', '') + + s = dateString + m = self.CRE_DATE2.search(s) + if m is not None: + index = m.start() + mth = int(s[:index]) + s = s[index + 1:] + + m = self.CRE_DATE2.search(s) + if m is not None: + index = m.start() + dy = int(s[:index]) + yr = int(s[index + 1:]) + # TODO should this have a birthday epoch constraint? 
+ if yr < 99: + yr += 2000 + else: + dy = int(string.strip(s)) + + if mth <= 12 and dy <= self.ptc.DaysInMonthList[mth - 1]: + sourceTime = (yr, mth, dy, hr, mn, sec, wd, yd, isdst) + else: + self.invalidFlag = 1 + sourceTime = time.localtime() #return current time if date string is invalid + + return sourceTime + + + def parseDateText(self, dateString): + """ + Parses strings like "May 31st, 2006" or "Jan 1st" or "July 2006" + + @type dateString: string + @param dateString: text to convert to a datetime + + @rtype: datetime + @return: calculated datetime value of dateString + """ + yr, mth, dy, hr, mn, sec, wd, yd, isdst = time.localtime() + + currentMth = mth + currentDy = dy + + s = dateString.lower() + m = self.CRE_DATE3.search(s) + mth = m.group('mthname') + mth = int(self.ptc.MthNames[mth]) + + if m.group('day') != None: + dy = int(m.group('day')) + else: + dy = 1 + + if m.group('year') != None: + yr = int(m.group('year')) + elif (mth < currentMth) or (mth == currentMth and dy < currentDy): + # if that day and month have already passed in this year, + # then increment the year by 1 + yr += 1 + + if dy <= self.ptc.DaysInMonthList[mth - 1]: + sourceTime = (yr, mth, dy, 9, 0, 0, wd, yd, isdst) + else: + # Return current time if date string is invalid + self.invalidFlag = 1 + sourceTime = time.localtime() + + return sourceTime + + + def _evalModifier(self, modifier, chunk1, chunk2, sourceTime): + """ + Evaluate the modifier string and following text (passed in + as chunk1 and chunk2) and if they match any known modifiers + calculate the delta and apply it to sourceTime + + @type modifier: string + @param modifier: modifier text to apply to sourceTime + @type chunk1: string + @param chunk1: first text chunk that followed modifier (if any) + @type chunk2: string + @param chunk2: second text chunk that followed modifier (if any) + @type sourceTime: datetime + @param sourceTime: datetime value to use as the base + + @rtype: tuple + @return: tuple of any remaining 
text and the modified sourceTime + """ + offset = self.ptc.Modifiers[modifier] + + if sourceTime is not None: + (yr, mth, dy, hr, mn, sec, wd, yd, isdst) = sourceTime + else: + (yr, mth, dy, hr, mn, sec, wd, yd, isdst) = time.localtime() + + # capture the units after the modifier and the remaining string after the unit + m = self.CRE_REMAINING.search(chunk2) + if m is not None: + index = m.start() + 1 + unit = chunk2[:m.start()] + chunk2 = chunk2[index:] + else: + unit = chunk2 + chunk2 = '' + + flag = 0 + + if unit == self.ptc.Target_Text['month'] or \ + unit == self.ptc.Target_Text['mth']: + if offset == 0: + dy = self.ptc.DaysInMonthList[mth - 1] + sourceTime = (yr, mth, dy, 9, 0, 0, wd, yd, isdst) + elif offset == 2: + # if day is the last day of the month, calculate the last day of the next month + if dy == self.ptc.DaysInMonthList[mth - 1]: + dy = self.ptc.DaysInMonthList[mth] + + start = datetime.datetime(yr, mth, dy, 9, 0, 0) + target = self.inc(start, month=1) + sourceTime = target.timetuple() + else: + start = datetime.datetime(yr, mth, 1, 9, 0, 0) + target = self.inc(start, month=offset) + sourceTime = target.timetuple() + + flag = 1 + + if unit == self.ptc.Target_Text['week'] or \ + unit == self.ptc.Target_Text['wk'] or \ + unit == self.ptc.Target_Text['w']: + if offset == 0: + start = datetime.datetime(yr, mth, dy, 17, 0, 0) + target = start + datetime.timedelta(days=(4 - wd)) + sourceTime = target.timetuple() + elif offset == 2: + start = datetime.datetime(yr, mth, dy, 9, 0, 0) + target = start + datetime.timedelta(days=7) + sourceTime = target.timetuple() + else: + return self._evalModifier(modifier, chunk1, "monday " + chunk2, sourceTime) + + flag = 1 + + if unit == self.ptc.Target_Text['day'] or \ + unit == self.ptc.Target_Text['dy'] or \ + unit == self.ptc.Target_Text['d']: + if offset == 0: + sourceTime = (yr, mth, dy, 17, 0, 0, wd, yd, isdst) + elif offset == 2: + start = datetime.datetime(yr, mth, dy, hr, mn, sec) + target = start + 
datetime.timedelta(days=1) + sourceTime = target.timetuple() + else: + start = datetime.datetime(yr, mth, dy, 9, 0, 0) + target = start + datetime.timedelta(days=offset) + sourceTime = target.timetuple() + + flag = 1 + + if unit == self.ptc.Target_Text['hour'] or \ + unit == self.ptc.Target_Text['hr']: + if offset == 0: + sourceTime = (yr, mth, dy, hr, 0, 0, wd, yd, isdst) + else: + start = datetime.datetime(yr, mth, dy, hr, 0, 0) + target = start + datetime.timedelta(hours=offset) + sourceTime = target.timetuple() + + flag = 1 + + if unit == self.ptc.Target_Text['year'] or \ + unit == self.ptc.Target_Text['yr'] or \ + unit == self.ptc.Target_Text['y']: + if offset == 0: + sourceTime = (yr, 12, 31, hr, mn, sec, wd, yd, isdst) + elif offset == 2: + sourceTime = (yr + 1, mth, dy, hr, mn, sec, wd, yd, isdst) + else: + sourceTime = (yr + offset, 1, 1, 9, 0, 0, wd, yd, isdst) + + flag = 1 + + if flag == 0: + m = self.CRE_WEEKDAY.match(unit) + if m is not None: + wkdy = m.group() + wkdy = self.ptc.WeekDays[wkdy] + + if offset == 0: + diff = wkdy - wd + start = datetime.datetime(yr, mth, dy, 9, 0, 0) + target = start + datetime.timedelta(days=diff) + sourceTime = target.timetuple() + else: + diff = wkdy - wd + start = datetime.datetime(yr, mth, dy, 9, 0, 0) + target = start + datetime.timedelta(days=diff + 7 * offset) + sourceTime = target.timetuple() + + flag = 1 + + if flag == 0: + m = self.CRE_TIME.match(unit) + if m is not None: + (yr, mth, dy, hr, mn, sec, wd, yd, isdst), self.invalidFlag = self.parse(unit) + start = datetime.datetime(yr, mth, dy, hr, mn, sec) + target = start + datetime.timedelta(days=offset) + sourceTime = target.timetuple() + + flag = 1 + self.modifierFlag = 0 + + # if the word after next is a number, the string is likely + # to be something like "next 4 hrs" for which we have to + # combine the units with the rest of the string + if flag == 0: + if offset < 0: + # if offset is negative, the unit has to be made negative + unit = '-%s' % unit + + 
chunk2 = '%s %s' % (unit, chunk2) + + self.modifierFlag = 0 + + return '%s %s' % (chunk1, chunk2), sourceTime + + + def _evalModifier2(self, modifier, chunk1 , chunk2, sourceTime): + """ + Evaluate the modifier string and following text (passed in + as chunk1 and chunk2) and if they match any known modifiers + calculate the delta and apply it to sourceTime + + @type modifier: string + @param modifier: modifier text to apply to sourceTime + @type chunk1: string + @param chunk1: first text chunk that followed modifier (if any) + @type chunk2: string + @param chunk2: second text chunk that followed modifier (if any) + @type sourceTime: datetime + @param sourceTime: datetime value to use as the base + + @rtype: tuple + @return: tuple of any remaining text and the modified sourceTime + """ + offset = self.ptc.Modifiers[modifier] + digit = r'\d+' + + if sourceTime is not None: + (yr, mth, dy, hr, mn, sec, wd, yd, isdst) = sourceTime + else: + (yr, mth, dy, hr, mn, sec, wd, yd, isdst) = time.localtime() + + self.modifier2Flag = 0 + + # If the string after the negative modifier starts with + # digits, then it is likely that the string is similar to + # " before 3 days" or 'evening prior to 3 days'. + # In this case, the total time is calculated by subtracting + # '3 days' from the current date. + # So, we have to identify the quantity and negate it before + # parsing the string. 
+ # This is not required for strings not starting with digits + # since the string is enough to calculate the sourceTime + if offset < 0: + m = re.match(digit, string.strip(chunk2)) + if m is not None: + qty = int(m.group()) * -1 + chunk2 = chunk2[m.end():] + chunk2 = '%d%s' % (qty, chunk2) + + sourceTime, flag = self.parse(chunk2, sourceTime) + + if chunk1 != '': + if offset < 0: + m = re.match(digit, string.strip(chunk1)) + if m is not None: + qty = int(m.group()) * -1 + chunk1 = chunk1[m.end():] + chunk1 = '%d%s' % (qty, chunk1) + + sourceTime, flag = self.parse(chunk1, sourceTime) + + return '', sourceTime + + + def _evalString(self, datetimeString, sourceTime=None): + """ + Calculate the datetime based on flags set by the L{parse()} routine + + Examples handled:: + RFC822, W3CDTF formatted dates + HH:MM[:SS][ am/pm] + MM/DD/YYYY + DD MMMM YYYY + + @type datetimeString: string + @param datetimeString: text to try and parse as more "traditional" date/time text + @type sourceTime: datetime + @param sourceTime: datetime value to use as the base + + @rtype: datetime + @return: calculated datetime value or current datetime if not parsed + """ + s = string.strip(datetimeString) + + # Given string date is a RFC822 date + if sourceTime is None: + sourceTime = _parse_date_rfc822(s) + + # Given string date is a W3CDTF date + if sourceTime is None: + sourceTime = _parse_date_w3dtf(s) + + if sourceTime is None: + s = s.lower() + + # Given string is in the format HH:MM(:SS)(am/pm) + if self.meridianFlag == 1: + if sourceTime is None: + (yr, mth, dy, hr, mn, sec, wd, yd, isdst) = time.localtime() + else: + (yr, mth, dy, hr, mn, sec, wd, yd, isdst) = sourceTime + + m = self.CRE_TIMEHMS2.search(s) + if m is not None: + dt = s[:m.start('meridian')].strip() + if len(dt) <= 2: + hr = int(dt) + mn = 0 + sec = 0 + else: + hr, mn, sec = _extract_time(m) + + if hr == 24: + hr = 0 + + sourceTime = (yr, mth, dy, hr, mn, sec, wd, yd, isdst) + meridian = m.group('meridian') + + if 
(re.compile("a").search(meridian)) and hr == 12: + sourceTime = (yr, mth, dy, 0, mn, sec, wd, yd, isdst) + if (re.compile("p").search(meridian)) and hr < 12: + sourceTime = (yr, mth, dy, hr+12, mn, sec, wd, yd, isdst) + + # invalid time + if hr > 24 or mn > 59 or sec > 59: + sourceTime = time.localtime() + self.invalidFlag = 1 + + self.meridianFlag = 0 + + # Given string is in the format HH:MM(:SS) + if self.timeFlag == 1: + if sourceTime is None: + (yr, mth, dy, hr, mn, sec, wd, yd, isdst) = time.localtime() + else: + (yr, mth, dy, hr, mn, sec, wd, yd, isdst) = sourceTime + + m = self.CRE_TIMEHMS.search(s) + if m is not None: + hr, mn, sec = _extract_time(m) + if hr == 24: + hr = 0 + + if hr > 24 or mn > 59 or sec > 59: + # invalid time + sourceTime = time.localtime() + self.invalidFlag = 1 + else: + sourceTime = (yr, mth, dy, hr, mn, sec, wd, yd, isdst) + + self.timeFlag = 0 + + # Given string is in the format 07/21/2006 + if self.dateStdFlag == 1: + sourceTime = self.parseDate(s) + self.dateStdFlag = 0 + + # Given string is in the format "May 23rd, 2005" + if self.dateStrFlag == 1: + sourceTime = self.parseDateText(s) + self.dateStrFlag = 0 + + # Given string is a weekday + if self.weekdyFlag == 1: + yr, mth, dy, hr, mn, sec, wd, yd, isdst = time.localtime() + start = datetime.datetime(yr, mth, dy, hr, mn, sec) + wkDy = self.ptc.WeekDays[s] + + if wkDy > wd: + qty = wkDy - wd + target = start + datetime.timedelta(days=qty) + wd = wkDy + else: + qty = 6 - wd + wkDy + 1 + target = start + datetime.timedelta(days=qty) + wd = wkDy + + sourceTime = target.timetuple() + self.weekdyFlag = 0 + + # Given string is a natural language time string like lunch, midnight, etc + if self.timeStrFlag == 1: + if sourceTime is None: + (yr, mth, dy, hr, mn, sec, wd, yd, isdst) = time.localtime() + else: + (yr, mth, dy, hr, mn, sec, wd, yd, isdst) = sourceTime + + sources = { 'now': (yr, mth, dy, hr, mn, sec, wd, yd, isdst), + 'noon': (yr, mth, dy, 12, 0, 0, wd, yd, isdst), + 
'lunch': (yr, mth, dy, 12, 0, 0, wd, yd, isdst), + 'morning': (yr, mth, dy, 6, 0, 0, wd, yd, isdst), + 'breakfast': (yr, mth, dy, 8, 0, 0, wd, yd, isdst), + 'dinner': (yr, mth, dy, 19, 0, 0, wd, yd, isdst), + 'evening': (yr, mth, dy, 18, 0, 0, wd, yd, isdst), + 'midnight': (yr, mth, dy, 0, 0, 0, wd, yd, isdst), + 'night': (yr, mth, dy, 21, 0, 0, wd, yd, isdst), + 'tonight': (yr, mth, dy, 21, 0, 0, wd, yd, isdst), + } + + if s in sources: + sourceTime = sources[s] + else: + sourceTime = time.localtime() + self.invalidFlag = 1 + + self.timeStrFlag = 0 + + # Given string is a natural language date string like today, tomorrow.. + if self.dayStrFlag == 1: + if sourceTime is None: + sourceTime = time.localtime() + + (yr, mth, dy, hr, mn, sec, wd, yd, isdst) = sourceTime + + sources = { 'tomorrow': 1, + 'today': 0, + 'yesterday': -1, + } + + start = datetime.datetime(yr, mth, dy, 9, 0, 0) + target = start + datetime.timedelta(days=sources[s]) + sourceTime = target.timetuple() + + self.dayStrFlag = 0 + + # Given string is a time string with units like "5 hrs 30 min" + if self.unitsFlag == 1: + modifier = '' # TODO + + if sourceTime is None: + sourceTime = time.localtime() + + m = self.CRE_UNITS.search(s) + if m is not None: + units = m.group('units') + quantity = s[:m.start('units')] + + sourceTime = self._buildTime(sourceTime, quantity, modifier, units) + self.unitsFlag = 0 + + # Given string is a time string with single char units like "5 h 30 m" + if self.qunitsFlag == 1: + modifier = '' # TODO + + if sourceTime is None: + sourceTime = time.localtime() + + m = self.CRE_QUNITS.search(s) + if m is not None: + units = m.group('qunits') + quantity = s[:m.start('qunits')] + + sourceTime = self._buildTime(sourceTime, quantity, modifier, units) + self.qunitsFlag = 0 + + # Given string does not match anything + if sourceTime is None: + sourceTime = time.localtime() + self.invalidFlag = 1 + + return sourceTime + + + def parse(self, datetimeString, sourceTime=None): + """ + 
Splits the L{datetimeString} into tokens, finds the regex patters + that match and then calculates a datetime value from the chunks + + if L{sourceTime} is given then the datetime value will be calcualted + from that datetime, otherwise from the current datetime. + + @type datetimeString: string + @param datetimeString: datetime text to evaluate + @type sourceTime: datetime + @param sourceTime: datetime value to use as the base + + @rtype: tuple + @return: tuple of any remaining text and the modified sourceTime + """ + s = string.strip(datetimeString.lower()) + dateStr = '' + parseStr = '' + totalTime = sourceTime + + self.invalidFlag = 0 + + if s == '' : + if sourceTime is not None: + return (sourceTime, 0) + else: + return (time.localtime(), 1) + + while len(s) > 0: + flag = 0 + chunk1 = '' + chunk2 = '' + + if _debug: + print 'parse (top of loop): [%s][%s]' % (s, parseStr) + + if parseStr == '': + # Modifier like next\prev.. + m = self.CRE_MODIFIER.search(s) + if m is not None: + self.modifierFlag = 1 + if (m.group('modifier') != s): + # capture remaining string + parseStr = m.group('modifier') + chunk1 = string.strip(s[:m.start('modifier')]) + chunk2 = string.strip(s[m.end('modifier'):]) + flag = 1 + else: + parseStr = s + + if parseStr == '': + # Modifier like from\after\prior.. 
+ m = self.CRE_MODIFIER2.search(s) + if m is not None: + self.modifier2Flag = 1 + if (m.group('modifier') != s): + # capture remaining string + parseStr = m.group('modifier') + chunk1 = string.strip(s[:m.start('modifier')]) + chunk2 = string.strip(s[m.end('modifier'):]) + flag = 1 + else: + parseStr = s + + if parseStr == '': + # String date format + m = self.CRE_DATE3.search(s) + if m is not None: + self.dateStrFlag = 1 + if (m.group('date') != s): + # capture remaining string + parseStr = m.group('date') + chunk1 = s[:m.start('date')] + chunk2 = s[m.end('date'):] + s = '%s %s' % (chunk1, chunk2) + flag = 1 + else: + parseStr = s + + if parseStr == '': + # Standard date format + m = self.CRE_DATE.search(s) + if m is not None: + self.dateStdFlag = 1 + if (m.group('date') != s): + # capture remaining string + parseStr = m.group('date') + chunk1 = s[:m.start('date')] + chunk2 = s[m.end('date'):] + s = '%s %s' % (chunk1, chunk2) + flag = 1 + else: + parseStr = s + + if parseStr == '': + # Natural language day strings + m = self.CRE_DAY.search(s) + if m is not None: + self.dayStrFlag = 1 + if (m.group('day') != s): + # capture remaining string + parseStr = m.group('day') + chunk1 = s[:m.start('day')] + chunk2 = s[m.end('day'):] + s = '%s %s' % (chunk1, chunk2) + flag = 1 + else: + parseStr = s + + if parseStr == '': + # Quantity + Units + m = self.CRE_UNITS.search(s) + if m is not None: + self.unitsFlag = 1 + if (m.group('qty') != s): + # capture remaining string + parseStr = m.group('qty') + chunk1 = s[:m.start('qty')] + chunk2 = s[m.end('qty'):] + s = '%s %s' % (chunk1, chunk2) + flag = 1 + else: + parseStr = s + + if parseStr == '': + # Quantity + Units + m = self.CRE_QUNITS.search(s) + if m is not None: + self.qunitsFlag = 1 + if (m.group('qty') != s): + # capture remaining string + parseStr = m.group('qty') + chunk1 = s[:m.start('qty')] + chunk2 = s[m.end('qty'):] + s = '%s %s' % (chunk1, chunk2) + flag = 1 + else: + parseStr = s + + if parseStr == '': + # Weekday 
+ m = self.CRE_WEEKDAY.search(s) + if m is not None: + self.weekdyFlag = 1 + if (m.group('weekday') != s): + # capture remaining string + parseStr = m.group() + chunk1 = s[:m.start('weekday')] + chunk2 = s[m.end('weekday'):] + s = '%s %s' % (chunk1, chunk2) + flag = 1 + else: + parseStr = s + + if parseStr == '': + # Natural language time strings + m = self.CRE_TIME.search(s) + if m is not None: + self.timeStrFlag = 1 + if (m.group('time') != s): + # capture remaining string + parseStr = m.group('time') + chunk1 = s[:m.start('time')] + chunk2 = s[m.end('time'):] + s = '%s %s' % (chunk1, chunk2) + flag = 1 + else: + parseStr = s + + if parseStr == '': + # HH:MM(:SS) am/pm time strings + m = self.CRE_TIMEHMS2.search(s) + if m is not None: + self.meridianFlag = 1 + if m.group('minutes') is not None: + if m.group('seconds') is not None: + parseStr = '%s:%s:%s %s' % (m.group('hours'), m.group('minutes'), m.group('seconds'), m.group('meridian')) + else: + parseStr = '%s:%s %s' % (m.group('hours'), m.group('minutes'), m.group('meridian')) + else: + parseStr = '%s %s' % (m.group('hours'), m.group('meridian')) + + chunk1 = s[:m.start('hours')] + chunk2 = s[m.end('meridian'):] + + s = '%s %s' % (chunk1, chunk2) + flag = 1 + + if parseStr == '': + # HH:MM(:SS) time strings + m = self.CRE_TIMEHMS.search(s) + if m is not None: + self.timeFlag = 1 + if m.group('seconds') is not None: + parseStr = '%s:%s:%s' % (m.group('hours'), m.group('minutes'), m.group('seconds')) + chunk1 = s[:m.start('hours')] + chunk2 = s[m.end('seconds'):] + else: + parseStr = '%s:%s' % (m.group('hours'), m.group('minutes')) + chunk1 = s[:m.start('hours')] + chunk2 = s[m.end('minutes'):] + + s = '%s %s' % (chunk1, chunk2) + flag = 1 + + # if string does not match any regex, empty string to come out of the while loop + if flag is 0: + s = '' + + if _debug: + print 'parse (bottom) [%s][%s][%s][%s]' % (s, parseStr, chunk1, chunk2) + print 'invalid [%d] weekday [%d] dateStd [%d] dateStr [%d] time [%d] timeStr 
[%d] meridian [%d]' % \ + (self.invalidFlag, self.weekdyFlag, self.dateStdFlag, self.dateStrFlag, self.timeFlag, self.timeStrFlag, self.meridianFlag) + print 'dayStr [%d] modifier [%d] modifier2 [%d] units [%d] qunits[%d]' % \ + (self.dayStrFlag, self.modifierFlag, self.modifier2Flag, self.unitsFlag, self.qunitsFlag) + + # evaluate the matched string + if parseStr != '': + if self.modifierFlag == 1: + t, totalTime = self._evalModifier(parseStr, chunk1, chunk2, totalTime) + + return self.parse(t, totalTime) + + elif self.modifier2Flag == 1: + s, totalTime = self._evalModifier2(parseStr, chunk1, chunk2, totalTime) + else: + totalTime = self._evalString(parseStr, totalTime) + parseStr = '' + + # String is not parsed at all + if totalTime is None or totalTime == sourceTime: + totalTime = time.localtime() + self.invalidFlag = 1 + + return (totalTime, self.invalidFlag) + + + def inc(self, source, month=None, year=None): + """ + Takes the given date, or current date if none is passed, and + increments it according to the values passed in by month + and/or year. + + This routine is needed because the timedelta() routine does + not allow for month or year increments. + + @type source: datetime + @param source: datetime value to increment + @type month: integer + @param month: optional number of months to increment + @type year: integer + @param year: optional number of years to increment + + @rtype: datetime + @return: L{source} incremented by the number of months and/or years + """ + yr = source.year + mth = source.month + + if year: + try: + yi = int(year) + except ValueError: + yi = 0 + + yr += yi + + if month: + try: + mi = int(month) + except ValueError: + mi = 0 + + m = abs(mi) + y = m / 12 # how many years are in month increment + m = m % 12 # get remaining months + + if mi < 0: + mth = mth - m # sub months from start month + if mth < 1: # cross start-of-year? 
+ y -= 1 # yes - decrement year + mth += 12 # and fix month + else: + mth = mth + m # add months to start month + if mth > 12: # cross end-of-year? + y += 1 # yes - increment year + mth -= 12 # and fix month + + yr += y + + d = source.replace(year=yr, month=mth) + + return source + (d - source) + diff -r fa0b7d2d998b -r 6b0ea72d7665 MoinMoin/support/parsedatetime/parsedatetime_consts.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MoinMoin/support/parsedatetime/parsedatetime_consts.py Mon Aug 21 02:30:05 2006 +0200 @@ -0,0 +1,278 @@ +#!/usr/bin/env python + +""" +CalendarConstants defines all constants used by parsedatetime.py. +""" + +__license__ = """Copyright (c) 2004-2006 Mike Taylor, All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" +__author__ = 'Mike Taylor ' +__contributors__ = ['Darshana Chhajed ', + ] + + +class CalendarConstants: + def __init__(self): + self.Locale = 'American' + + self.TIMESEP = ':' + + self.RE_SPECIAL = r'(?P^[in|on|of|at]+)\s+' + self.RE_UNITS = r'(?P(-?\d+\s*(?P((hour|hr|minute|min|second|sec|day|dy|week|wk|month|mth|year|yr)s?))))' + self.RE_QUNITS = r'(?P(-?\d+\s?(?Ph|m|s|d|w|m|y)(\s|,|$)))' + self.RE_MODIFIER = r'(?P(previous|prev|last|next|this|eo|(end\sof)|(in\sa)))' + self.RE_MODIFIER2 = r'(?P(from|before|after|ago|prior))' + self.RE_TIMEHMS = r'(?P\d\d?)(?P:|)(?P\d\d)(?:(?P=tsep)(?P\d\d(?:[.,]\d+)?))?' 
+ self.RE_TIMEHMS2 = r'(?P(\d\d?))((?P:|)(?P(\d\d?))(?:(?P=tsep)(?P\d\d?(?:[.,]\d+)?))?)?\s?(?P(am|pm|a.m.|p.m.|a|p))' + self.RE_DATE = r'(?P\d+([/.\\]\d+)+)' + self.RE_DATE2 = r'[/.\\-]' + self.RE_DATE3 = r'(?P((?P(january|february|march|april|may|june|july|august|september|october|november|december))\s?((?P\d\d?)(\s|rd|st|nd|th|,|$)+)?(?P\d\d\d\d)?))' + self.RE_MONTH = r'(?P((?P(january|february|march|april|may|june|july|august|september|october|november|december))(\s?(?P(\d\d\d\d)))?))' + self.RE_WEEKDAY = r'(?P(monday|mon|tuesday|tue|wednesday|wed|thursday|thu|friday|saturday|sat|sunday|sun))' + self.RE_DAY = r'(?P(today|tomorrow|yesterday))' + self.RE_TIME = r'\s*(?P