Vedeli ste, že slovník na stránke slovnik.sk je momentálne celkom jednoducho parsovateľný? Samozrejme, ak to začne niekto zneužívať, predpokladám, že to páni vývojári zmenia, ale momentálne sa dá celkom jednoducho dotazovať vďaka rozumnému formátu URL pri dotazovaní cez HTTP GET. Je to z programatického hľadiska oveľa prehľadnejšie ako napríklad taký cestovný poriadok na cp.sk. Tu je môj Java kód na odoslanie jedného dotazu. Určite by sa to dalo napísať aj krajšie a vôbec som to netestoval, je to len na demonštračné účely.
package sk.linhard.slovnik;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Minimal screen-scraping client for the slovnik.sk online dictionary.
 *
 * <p>A request is described by a search term and a language pair. Calling
 * {@link #perform()} issues a single HTTP GET against {@link #DEFAULT_URL},
 * reads the returned page as UTF-8 and extracts the translations from the
 * {@code <table class="p">} blocks with plain string searches and regular
 * expressions (no real HTML parser is involved, so any change in the page
 * markup will silently yield fewer or no results).
 *
 * @author Michal Linhard
 */
public class SlovnikRequest {

    /**
     * One dictionary entry: a source term together with its translations.
     * Instances are immutable; the translation array is copied on the way
     * in and on the way out.
     */
    public static class SlovnikTranslation {
        private final String from;
        private final String[] to;

        /**
         * @param aFrom the source term
         * @param aTo   the translations of {@code aFrom}; copied defensively,
         *              {@code null} is treated as "no translations"
         */
        public SlovnikTranslation(String aFrom, String[] aTo) {
            from = aFrom;
            // Copy so that later changes to the caller's array cannot alter this entry.
            to = (aTo == null) ? new String[0] : aTo.clone();
        }

        /** @return the source term */
        public String getFrom() {
            return from;
        }

        /** @return a fresh copy of the translations */
        public String[] getTo() {
            return to.clone();
        }
    }

    /** Base URL the query string is appended to. */
    public static final String DEFAULT_URL = "http://slovnik.azet.sk";

    /** Charset used both for URL-encoding the query and for decoding the response. */
    private static final String CHARSET = "UTF-8";

    /**
     * Matches one result row. Group 1 is the content of the {@code class="z"}
     * cell (source term), group 2 the text of the {@code class="do"} cell that
     * precedes its trailing link and div (translation). The middle cell must
     * contain the arrow character U+2192 surrounded by spaces; it is written
     * as a Unicode escape so the source compiles regardless of file encoding.
     * NOTE(review): if the site emits the arrow as an HTML entity instead of
     * the literal character this will not match - verify against a live page.
     */
    private static final Pattern ROW_PATTERN = Pattern.compile(
            "<td class=\"z\"[^>]*>\\s*(.*)\\s*</td>\\s*"
            + "<td class=\"sipka\">\\s* \u2192 \\s*</td>\\s*"
            + "<td class=\"do\"[^>]*>(.*)\\s<a[^>]*>[^<]*</a>\\s*"
            + "<div[^>]*>[^<]*</div>\\s*</td>", Pattern.DOTALL);

    /**
     * Matches a source term wrapped as
     * {@code <span>before<span class="highlight">hit</span>after</span>}.
     */
    private static final Pattern SPAN_PATTERN = Pattern.compile(
            "[^<>]*<span>([^<>]*)<span class=\"highlight\">([^<>]*)</span>([^<>]*)</span>[^<>]*");

    private final String term;
    private final String lFrom;
    private final String lTo;

    /**
     * Create a request object.
     *
     * @param aTerm     Term to search for
     * @param aLangFrom Language to translate from
     * @param aLangTo   Language to translate to
     */
    public SlovnikRequest(String aTerm, String aLangFrom, String aLangTo) {
        term = aTerm;
        lFrom = aLangFrom;
        lTo = aLangTo;
    }

    /**
     * Builds the query URL {@code DEFAULT_URL/?q=<term>&l=<from>-<to>}.
     * All three values are URL-encoded here (spaces become {@code +}), so
     * callers pass them raw.
     *
     * @throws IOException if UTF-8 is not supported by the runtime
     */
    private static String encodeReqURL(String aTerm, String aLangFrom, String aLangTo)
            throws IOException {
        StringBuilder sb = new StringBuilder(DEFAULT_URL);
        sb.append("/?q=").append(URLEncoder.encode(aTerm, CHARSET));
        sb.append("&l=").append(URLEncoder.encode(aLangFrom, CHARSET));
        sb.append("-").append(URLEncoder.encode(aLangTo, CHARSET));
        return sb.toString();
    }

    /**
     * Reads the whole stream and decodes it as UTF-8, then closes the stream.
     *
     * <p>The bytes are collected first and decoded once at the end: decoding
     * each 4096-byte chunk separately would corrupt any multi-byte character
     * that happens to straddle a chunk boundary.
     *
     * @param in stream to drain; always closed before returning
     * @return the decoded content
     * @throws IOException if reading or closing fails
     */
    private static String slurp(InputStream in) throws IOException {
        try {
            ByteArrayOutputStream collected = new ByteArrayOutputStream();
            byte[] chunk = new byte[4096];
            for (int n; (n = in.read(chunk)) != -1;) {
                collected.write(chunk, 0, n);
            }
            return collected.toString(CHARSET);
        } finally {
            if (in != null) {
                in.close();
            }
        }
    }

    /** @return the inner text of every {@code <table class="p">} in {@code src} */
    private static List<String> findPTables(String src) {
        return findTags(src, "<table class=\"p\">", "</table>");
    }

    /** @return the inner text of every {@code <tr>} in {@code src} */
    private static List<String> findTableRows(String src) {
        return findTags(src, "<tr>", "</tr>");
    }

    /**
     * Collects the text between each occurrence of {@code tagBegin} and the
     * next following {@code tagEnd}. This is a plain substring scan: nesting
     * is not understood, the first end delimiter after a begin delimiter wins.
     *
     * @param src      text to scan
     * @param tagBegin exact opening delimiter
     * @param tagEnd   exact closing delimiter
     * @return the enclosed fragments in document order (delimiters excluded)
     * @throws RuntimeException if an opening delimiter has no closing one
     */
    private static List<String> findTags(String src, String tagBegin, String tagEnd) {
        List<String> found = new ArrayList<String>();
        int begin = src.indexOf(tagBegin);
        while (begin != -1) {
            // Look for the end delimiter only AFTER the begin delimiter, otherwise an
            // end delimiter that also occurs inside the begin delimiter would yield
            // an inverted range and a StringIndexOutOfBoundsException.
            int contentStart = begin + tagBegin.length();
            int end = src.indexOf(tagEnd, contentStart);
            if (end == -1) {
                throw new RuntimeException("tag not ended");
            }
            found.add(src.substring(contentStart, end));
            begin = src.indexOf(tagBegin, end + tagEnd.length());
        }
        return found;
    }

    /**
     * Sends the query and parses the response.
     *
     * <p>Side effects: performs a blocking HTTP GET and prints the query URL
     * (and any table row that could not be parsed) to standard output.
     *
     * @return one entry per result table that yielded at least one
     *         translation; empty array if nothing was recognised
     * @throws RuntimeException wrapping the {@link IOException} if the URL is
     *         malformed or the page cannot be fetched, or directly
     *         ("tag not ended") if the markup is truncated
     */
    public SlovnikTranslation[] perform() {
        try {
            URL u = new URL(encodeReqURL(term, lFrom, lTo));
            System.out.println("---- QueryURL: " + u.toString() + " ----");
            List<SlovnikTranslation> result = parseTranslations(slurp(u.openStream()));
            return result.toArray(new SlovnikTranslation[result.size()]);
        } catch (IOException e) {
            // Only checked I/O failures are wrapped; runtime exceptions propagate as they are.
            throw new RuntimeException(e);
        }
    }

    /**
     * Extracts the translations from a result page: each {@code class="p"}
     * table becomes one entry whose source term is taken from the first
     * matching row and whose translations come from all matching rows.
     */
    private static List<SlovnikTranslation> parseTranslations(String html) {
        List<SlovnikTranslation> result = new ArrayList<SlovnikTranslation>();
        for (String eachPtable : findPTables(html)) {
            String termFrom = null;
            List<String> toTerms = new ArrayList<String>();
            for (String row : findTableRows(eachPtable)) {
                Matcher mTR = ROW_PATTERN.matcher(row);
                if (mTR.find()) {
                    if (termFrom == null) {
                        termFrom = mTR.group(1);
                    }
                    toTerms.add(mTR.group(2));
                } else {
                    System.out.println("TERMS NOT FOUND FOR ROW: " + row);
                }
            }
            if (termFrom != null && !toTerms.isEmpty()) {
                result.add(new SlovnikTranslation(removeSpans(termFrom),
                        toTerms.toArray(new String[toTerms.size()])));
            }
        }
        return result;
    }

    /**
     * Strips the highlight markup from a source term. When the whole input
     * matches {@link #SPAN_PATTERN} the three text parts are returned joined
     * as {@code before|hit|after}; otherwise the input is returned unchanged.
     */
    private static String removeSpans(String atermFrom) {
        Matcher m = SPAN_PATTERN.matcher(atermFrom);
        if (m.matches()) {
            return m.group(1) + "|" + m.group(2) + "|" + m.group(3);
        }
        return atermFrom;
    }

    /**
     * Demo: looks up a Slovak word in the Slovak-Spanish dictionary and
     * prints every translation found.
     */
    public static void main(String[] args) throws Exception {
        // "\u017eena" is the Slovak word for "woman"; escaped to stay encoding-independent.
        SlovnikTranslation[] result = new SlovnikRequest("\u017eena", "sk", "es").perform();
        for (SlovnikTranslation t : result) {
            System.out.println("---- translation of \"" + t.getFrom() + "\" ----");
            for (String s : t.getTo()) {
                System.out.println(s);
            }
        }
    }
}
|