mirror of
https://github.com/fankes/termux-app.git
synced 2025-09-06 02:35:19 +08:00
improve url matching regex
It now supports complex URLs and additional schemes beyond just http/ftp.
This commit is contained in:
committed by
Fredrik Fornwall
parent
5b7e40638c
commit
c19909cef1
@@ -654,19 +654,86 @@ public final class TermuxActivity extends Activity implements ServiceConnection
|
|||||||
}
|
}
|
||||||
|
|
||||||
static LinkedHashSet<CharSequence> extractUrls(String text) {
|
static LinkedHashSet<CharSequence> extractUrls(String text) {
|
||||||
// Pattern for recognizing a URL, based off RFC 3986
|
|
||||||
// http://stackoverflow.com/questions/5713558/detect-and-extract-url-from-a-string
|
StringBuilder regex_sb = new StringBuilder();
|
||||||
|
|
||||||
|
regex_sb.append("("); // Begin first matching group.
|
||||||
|
regex_sb.append("(?:"); // Begin scheme group.
|
||||||
|
regex_sb.append("dav|"); // The DAV proto.
|
||||||
|
regex_sb.append("dict|"); // The DICT proto.
|
||||||
|
regex_sb.append("dns|"); // The DNS proto.
|
||||||
|
regex_sb.append("file|"); // File path.
|
||||||
|
regex_sb.append("finger|"); // The Finger proto.
|
||||||
|
regex_sb.append("ftp(?:s?)|"); // The FTP proto.
|
||||||
|
regex_sb.append("git|"); // The Git proto.
|
||||||
|
regex_sb.append("gopher|"); // The Gopher proto.
|
||||||
|
regex_sb.append("http(?:s?)|"); // The HTTP proto.
|
||||||
|
regex_sb.append("imap(?:s?)|"); // The IMAP proto.
|
||||||
|
regex_sb.append("irc(?:[6s]?)|"); // The IRC proto.
|
||||||
|
regex_sb.append("ip[fn]s|"); // The IPFS proto.
|
||||||
|
regex_sb.append("ldap(?:s?)|"); // The LDAP proto.
|
||||||
|
regex_sb.append("pop3(?:s?)|"); // The POP3 proto.
|
||||||
|
regex_sb.append("redis(?:s?)|"); // The Redis proto.
|
||||||
|
regex_sb.append("rsync|"); // The Rsync proto.
|
||||||
|
regex_sb.append("rtsp(?:[su]?)|"); // The RTSP proto.
|
||||||
|
regex_sb.append("sftp|"); // The SFTP proto.
|
||||||
|
regex_sb.append("smb(?:s?)|"); // The SAMBA proto.
|
||||||
|
regex_sb.append("smtp(?:s?)|"); // The SMTP proto.
|
||||||
|
regex_sb.append("svn(?:(?:\\+ssh)?)|"); // The Subversion proto.
|
||||||
|
regex_sb.append("tcp|"); // The TCP proto.
|
||||||
|
regex_sb.append("telnet|"); // The Telnet proto.
|
||||||
|
regex_sb.append("tftp|"); // The TFTP proto.
|
||||||
|
regex_sb.append("udp|"); // The UDP proto.
|
||||||
|
regex_sb.append("vnc|"); // The VNC proto.
|
||||||
|
regex_sb.append("ws(?:s?)"); // The Websocket proto.
|
||||||
|
regex_sb.append(")://"); // End scheme group.
|
||||||
|
regex_sb.append(")"); // End first matching group.
|
||||||
|
|
||||||
|
|
||||||
|
// Begin second matching group.
|
||||||
|
regex_sb.append("(");
|
||||||
|
|
||||||
|
// User name and/or password in format 'user:pass@'.
|
||||||
|
regex_sb.append("(?:\\S+(?::\\S*)?@)?");
|
||||||
|
|
||||||
|
// Begin host group.
|
||||||
|
regex_sb.append("(?:");
|
||||||
|
|
||||||
|
// IP address (from http://www.regular-expressions.info/examples.html).
|
||||||
|
regex_sb.append("(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)|");
|
||||||
|
|
||||||
|
// Host name or domain.
|
||||||
|
regex_sb.append("(?:(?:[a-z\\u00a1-\\uffff0-9]-*)*[a-z\\u00a1-\\uffff0-9]+)(?:(?:\\.(?:[a-z\\u00a1-\\uffff0-9]-*)*[a-z\\u00a1-\\uffff0-9]+)*(?:\\.(?:[a-z\\u00a1-\\uffff]{2,})))?|");
|
||||||
|
|
||||||
|
// Just path. Used in case of 'file://' scheme.
|
||||||
|
regex_sb.append("/(?:(?:[a-z\\u00a1-\\uffff0-9]-*)*[a-z\\u00a1-\\uffff0-9]+)");
|
||||||
|
|
||||||
|
// End host group.
|
||||||
|
regex_sb.append(")");
|
||||||
|
|
||||||
|
// Port number.
|
||||||
|
regex_sb.append("(?::\\d{1,5})?");
|
||||||
|
|
||||||
|
// Resource path with optional query string.
|
||||||
|
regex_sb.append("(?:/[a-zA-Z0-9:@%\\-._~!$&()*+,;=?/]*)?");
|
||||||
|
|
||||||
|
// End second matching group.
|
||||||
|
regex_sb.append(")");
|
||||||
|
|
||||||
final Pattern urlPattern = Pattern.compile(
|
final Pattern urlPattern = Pattern.compile(
|
||||||
"(?:^|[\\W])((ht|f)tp(s?)://|www\\.)" + "(([\\w\\-]+\\.)+?([\\w\\-.~]+/?)*" + "[\\p{Alnum}.,%_=?&#\\-+()\\[\\]*$~@!:/{};']*)",
|
regex_sb.toString(),
|
||||||
Pattern.CASE_INSENSITIVE | Pattern.MULTILINE | Pattern.DOTALL);
|
Pattern.CASE_INSENSITIVE | Pattern.MULTILINE | Pattern.DOTALL);
|
||||||
|
|
||||||
LinkedHashSet<CharSequence> urlSet = new LinkedHashSet<>();
|
LinkedHashSet<CharSequence> urlSet = new LinkedHashSet<>();
|
||||||
Matcher matcher = urlPattern.matcher(text);
|
Matcher matcher = urlPattern.matcher(text);
|
||||||
|
|
||||||
while (matcher.find()) {
|
while (matcher.find()) {
|
||||||
int matchStart = matcher.start(1);
|
int matchStart = matcher.start(1);
|
||||||
int matchEnd = matcher.end();
|
int matchEnd = matcher.end();
|
||||||
String url = text.substring(matchStart, matchEnd);
|
String url = text.substring(matchStart, matchEnd);
|
||||||
urlSet.add(url);
|
urlSet.add(url);
|
||||||
}
|
}
|
||||||
|
|
||||||
return urlSet;
|
return urlSet;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user