commit
e80b16d979
21
boilerpipe-1.2.0-libdir-patch
Normal file
21
boilerpipe-1.2.0-libdir-patch
Normal file
@ -0,0 +1,21 @@
|
||||
--- build.xml 2011-05-28 16:56:41.000000000 +0200
|
||||
+++ build.xml-gil 2011-08-15 17:57:57.279492364 +0200
|
||||
@@ -53,7 +53,7 @@
|
||||
|
||||
<property name="build.main" value="${build.dir}/main" />
|
||||
<property name="build.demo" value="${build.dir}/demo" />
|
||||
- <property name="lib.dir" value="${app.dir}/lib" />
|
||||
+ <property name="lib.dir" value="/usr/share/java" />
|
||||
<property name="src.main" value="${app.dir}/src/main" />
|
||||
<property name="src.demo" value="${app.dir}/src/demo" />
|
||||
<property name="dist.dir" value="${app.dir}/dist" />
|
||||
@@ -67,7 +67,8 @@
|
||||
|
||||
<path id="classpath.libs">
|
||||
<fileset dir="${lib.dir}">
|
||||
- <include name="**/*.jar" />
|
||||
+ <include name="nekohtml.jar" />
|
||||
+ <include name="xerces-j2.jar" />
|
||||
</fileset>
|
||||
</path>
|
||||
|
||||
2228
boilerpipe-1.2.0-nekohtml-patch
Normal file
2228
boilerpipe-1.2.0-nekohtml-patch
Normal file
File diff suppressed because it is too large
Load Diff
BIN
boilerpipe-1.2.0-src.tar.gz
Normal file
BIN
boilerpipe-1.2.0-src.tar.gz
Normal file
Binary file not shown.
35
boilerpipe-1.2.0.pom
Normal file
35
boilerpipe-1.2.0.pom
Normal file
@ -0,0 +1,35 @@
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<groupId>de.l3s.boilerpipe</groupId>
|
||||
<artifactId>boilerpipe</artifactId>
|
||||
<packaging>jar</packaging>
|
||||
<version>1.2.0</version>
|
||||
<url>http://code.google.com/p/boilerpipe/</url>
|
||||
<licenses>
|
||||
<license>
|
||||
<name>Apache License 2.0</name>
|
||||
</license>
|
||||
</licenses>
|
||||
<name>Boilerpipe -- Boilerplate Removal and Fulltext Extraction from HTML pages</name>
|
||||
<description>The boilerpipe library provides algorithms to detect and remove the surplus "clutter" (boilerplate, templates) around the main textual content of a web page.
|
||||
|
||||
The library already provides specific strategies for common tasks (for example: news article extraction) and may also be easily extended for individual problem settings.
|
||||
|
||||
Extracting content is very fast (milliseconds), just needs the input document (no global or site-level information required) and is usually quite accurate.
|
||||
|
||||
Boilerpipe is a Java library written by Christian Kohlschütter. It is released under the Apache License 2.0.
|
||||
|
||||
The algorithms used by the library are based on (and extending) some concepts of the paper "Boilerplate Detection using Shallow Text Features" by Christian Kohlschütter et al., presented at WSDM 2010 -- The Third ACM International Conference on Web Search and Data Mining New York City, NY USA.
|
||||
</description>
|
||||
<scm>
|
||||
<connection>scm:svn:http://boilerpipe.googlecode.com/svn/trunk/</connection>
|
||||
<url>http://code.google.com/p/boilerpipe/source/browse/</url>
|
||||
</scm>
|
||||
<developers>
|
||||
<developer>
|
||||
<name>Christian Kohlschütter</name>
|
||||
</developer>
|
||||
</developers>
|
||||
</project>
|
||||
121
boilerpipe.spec
Normal file
121
boilerpipe.spec
Normal file
@ -0,0 +1,121 @@
|
||||
Name: boilerpipe
|
||||
Version: 1.2.0
|
||||
Release: 1
|
||||
Summary: Boilerplate Removal and Fulltext Extraction from HTML pages
|
||||
License: ASL 2.0
|
||||
Url: https://github.com/kohlschutter/boilerpipe
|
||||
Source0: http://boilerpipe.googlecode.com/files/%{name}-%{version}-src.tar.gz
|
||||
Source1: http://boilerpipe.googlecode.com/svn/repo/de/l3s/%{name}/%{name}/%{version}/%{name}-%{version}.pom
|
||||
Patch0: %{name}-1.2.0-libdir-patch
|
||||
Patch1: %{name}-1.2.0-nekohtml-patch
|
||||
BuildRequires: ant java-devel javapackages-local nekohtml xerces-j2
|
||||
BuildArch: noarch
|
||||
|
||||
%description
|
||||
The boilerpipe library provides algorithms to detect and
|
||||
remove the surplus "clutter" (boilerplate, templates)
|
||||
around the main textual content of a web page.
|
||||
The library already provides specific strategies
|
||||
for common tasks (for example: news article extraction) and
|
||||
may also be easily extended for individual problem settings.
|
||||
Extracting content is very fast (milliseconds), just needs the
|
||||
input document (no global or site-level information required) and
|
||||
is usually quite accurate.
|
||||
|
||||
%package javadoc
|
||||
Summary: Javadoc for %{name}
|
||||
|
||||
%description javadoc
|
||||
This package contains javadoc for %{name}.
|
||||
|
||||
%prep
|
||||
%setup -q
|
||||
find . -iname '*.jar' -delete
|
||||
find . -iname '*.class' -delete
|
||||
%patch0 -p0
|
||||
cp %{SOURCE1} pom.xml
|
||||
%patch1 -p1
|
||||
# Rewrite each listed source in place with native2ascii so that non-ASCII
|
||||
# characters are stored as \uXXXX escapes and the build does not depend on
|
||||
# the platform default encoding. Every file is listed exactly once.
|
||||
for s in src/main/de/l3s/boilerpipe/BoilerpipeInput.java \
|
||||
src/main/de/l3s/boilerpipe/BoilerpipeFilter.java \
|
||||
src/main/de/l3s/boilerpipe/BoilerpipeExtractor.java \
|
||||
src/main/de/l3s/boilerpipe/BoilerpipeProcessingException.java \
|
||||
src/main/de/l3s/boilerpipe/conditions/TextBlockCondition.java \
|
||||
src/main/de/l3s/boilerpipe/document/TextBlock.java \
|
||||
src/main/de/l3s/boilerpipe/document/TextDocumentStatistics.java \
|
||||
src/main/de/l3s/boilerpipe/document/TextDocument.java \
|
||||
src/main/de/l3s/boilerpipe/estimators/SimpleEstimator.java \
|
||||
src/main/de/l3s/boilerpipe/extractors/LargestContentExtractor.java \
|
||||
src/main/de/l3s/boilerpipe/extractors/DefaultExtractor.java \
|
||||
src/main/de/l3s/boilerpipe/extractors/NumWordsRulesExtractor.java \
|
||||
src/main/de/l3s/boilerpipe/extractors/KeepEverythingWithMinKWordsExtractor.java \
|
||||
src/main/de/l3s/boilerpipe/extractors/ExtractorBase.java \
|
||||
src/main/de/l3s/boilerpipe/extractors/ArticleSentencesExtractor.java \
|
||||
src/main/de/l3s/boilerpipe/extractors/CommonExtractors.java \
|
||||
src/main/de/l3s/boilerpipe/extractors/CanolaExtractor.java \
|
||||
src/main/de/l3s/boilerpipe/extractors/ArticleExtractor.java \
|
||||
src/main/de/l3s/boilerpipe/extractors/KeepEverythingExtractor.java \
|
||||
src/main/de/l3s/boilerpipe/filters/english/HeuristicFilterBase.java \
|
||||
src/main/de/l3s/boilerpipe/filters/english/KeepLargestFulltextBlockFilter.java \
|
||||
src/main/de/l3s/boilerpipe/filters/english/TerminatingBlocksFinder.java \
|
||||
src/main/de/l3s/boilerpipe/filters/english/IgnoreBlocksAfterContentFilter.java \
|
||||
src/main/de/l3s/boilerpipe/filters/english/IgnoreBlocksAfterContentFromEndFilter.java \
|
||||
src/main/de/l3s/boilerpipe/filters/english/DensityRulesClassifier.java \
|
||||
src/main/de/l3s/boilerpipe/filters/english/MinFulltextWordsFilter.java \
|
||||
src/main/de/l3s/boilerpipe/filters/english/NumWordsRulesClassifier.java \
|
||||
src/main/de/l3s/boilerpipe/filters/heuristics/SimpleBlockFusionProcessor.java \
|
||||
src/main/de/l3s/boilerpipe/filters/heuristics/BlockProximityFusion.java \
|
||||
src/main/de/l3s/boilerpipe/filters/heuristics/KeepLargestBlockFilter.java \
|
||||
src/main/de/l3s/boilerpipe/filters/heuristics/DocumentTitleMatchClassifier.java \
|
||||
src/main/de/l3s/boilerpipe/filters/heuristics/LabelFusion.java \
|
||||
src/main/de/l3s/boilerpipe/filters/heuristics/AddPrecedingLabelsFilter.java \
|
||||
src/main/de/l3s/boilerpipe/filters/heuristics/ExpandTitleToContentFilter.java \
|
||||
src/main/de/l3s/boilerpipe/filters/heuristics/ContentFusion.java \
|
||||
src/main/de/l3s/boilerpipe/filters/simple/MinWordsFilter.java \
|
||||
src/main/de/l3s/boilerpipe/filters/simple/LabelToBoilerplateFilter.java \
|
||||
src/main/de/l3s/boilerpipe/filters/simple/LabelToContentFilter.java \
|
||||
src/main/de/l3s/boilerpipe/filters/simple/InvertedFilter.java \
|
||||
src/main/de/l3s/boilerpipe/filters/simple/MinClauseWordsFilter.java \
|
||||
src/main/de/l3s/boilerpipe/filters/simple/BoilerplateBlockFilter.java \
|
||||
src/main/de/l3s/boilerpipe/filters/simple/SplitParagraphBlocksFilter.java \
|
||||
src/main/de/l3s/boilerpipe/filters/simple/MarkEverythingContentFilter.java \
|
||||
src/main/de/l3s/boilerpipe/labels/DefaultLabels.java \
|
||||
src/main/de/l3s/boilerpipe/labels/ConditionalLabelAction.java \
|
||||
src/main/de/l3s/boilerpipe/labels/LabelAction.java \
|
||||
src/main/de/l3s/boilerpipe/sax/BoilerpipeSAXInput.java \
|
||||
src/main/de/l3s/boilerpipe/sax/HTMLHighlighter.java \
|
||||
src/main/de/l3s/boilerpipe/sax/BoilerpipeHTMLContentHandler.java \
|
||||
src/main/de/l3s/boilerpipe/sax/BoilerpipeHTMLParser.java \
|
||||
src/main/de/l3s/boilerpipe/sax/TagActionMap.java \
|
||||
src/main/de/l3s/boilerpipe/sax/InputSourceable.java \
|
||||
src/main/de/l3s/boilerpipe/sax/HTMLDocument.java \
|
||||
src/main/de/l3s/boilerpipe/sax/CommonTagActions.java \
|
||||
src/main/de/l3s/boilerpipe/sax/DefaultTagActionMap.java \
|
||||
src/main/de/l3s/boilerpipe/sax/HTMLFetcher.java \
|
||||
src/main/de/l3s/boilerpipe/sax/TagAction.java \
|
||||
src/main/de/l3s/boilerpipe/sax/MarkupTagAction.java \
|
||||
src/main/de/l3s/boilerpipe/util/UnicodeTokenizer.java;do
|
||||
native2ascii -encoding UTF8 "${s}" "${s}"
|
||||
done
|
||||
|
||||
%build
|
||||
ant -Dapp.javaversion=1.6
|
||||
|
||||
%install
|
||||
%mvn_artifact pom.xml dist/%{name}-%{version}.jar
|
||||
%mvn_file de.l3s.%{name}:%{name} %{name}
|
||||
%mvn_install -J javadoc/1.2
|
||||
install -pm 644 dist/%{name}-demo-%{version}.jar \
|
||||
%{buildroot}%{_javadir}/%{name}-demo.jar
|
||||
|
||||
%files -f .mfiles
|
||||
%{_javadir}/%{name}-demo.jar
|
||||
%license LICENSE.txt NOTICE.txt
|
||||
|
||||
%files javadoc -f .mfiles-javadoc
|
||||
%license LICENSE.txt NOTICE.txt
|
||||
|
||||
%changelog
|
||||
* Sat Jul 25 2020 chengzihan <chengzihan2@huawei.com> - 1.2.0-1
|
||||
- Package init
|
||||
4
boilerpipe.yaml
Normal file
4
boilerpipe.yaml
Normal file
@ -0,0 +1,4 @@
|
||||
version_control: NA
|
||||
src_repo: NA
|
||||
tag_prefix: NA
|
||||
seperator: NA
|
||||
Loading…
x
Reference in New Issue
Block a user