findURLs

Finds URLs in a string, returning an array of them. Does not filter out duplicates.

A replacement for regex matching that uses much less memory when compiling (around 300 MB).

To consider: does this need a dstring?

@safe pure
findURLs
(
const string line
,
const uint max = uint.max
)

Parameters

line string

String line to examine and find URLs in.

max uint

Maximum number of URLs to find. Default is uint.max.

Return Value

Type: auto

A string[] array of found URLs. These include fragment identifiers.

Examples

// Replaces the following:
// enum stephenhay = `\bhttps?://[^\s/$.?#].[^\s]*`;
// static urlRegex = ctRegex!stephenhay;

// Two of the three candidates are expected back; "httpx://" is not matched.
string[] urls = findURLs("blah https://google.com http://facebook.com httpx://wefpokwe");
assert(urls.length == 2);
import std.conv : to;

{
    // A lone URL is returned unchanged.
    const urls = findURLs("http://google.com");
    assert((urls.length == 1), urls.to!string);
    assert((urls[0] == "http://google.com"), urls[0]);
}
{
    // "shttps://c" is not a URL, and trailing whitespace is not part of the last match.
    const urls = findURLs("blah https://a.com http://b.com shttps://c https://d.asdf.asdf.asdf        ");
    assert((urls.length == 3), urls.to!string);
    assert((urls == [ "https://a.com", "http://b.com", "https://d.asdf.asdf.asdf" ]), urls.to!string);
}
{
    // max 2: same input as above, but only the first two URLs are returned.
    const urls = findURLs("blah https://a.com http://b.com shttps://c https://d.asdf.asdf.asdf        ", max: 2);
    assert((urls.length == 2), urls.to!string);
    assert((urls == [ "https://a.com", "http://b.com" ]), urls.to!string);
}
{
    // Only one of these candidates is expected to count as a URL.
    const urls = findURLs("http:// http://asdf https:// asdfhttpasdf http://google.com");
    assert((urls.length == 1), urls.to!string);
}
{
    // max 0: nothing is returned even though the input contains a valid URL.
    const urls = findURLs("http:// http://asdf https:// asdfhttpasdf http://google.com", max: 0);
    assert(!urls.length, urls.to!string);
}
{
    // Schemes run together without separating whitespace yield no URLs.
    const urls = findURLs("http://a.sehttp://a.shttp://a.http://http:");
    assert(!urls.length, urls.to!string);
}
{
    // Non-ASCII characters in the host name do not prevent a match.
    const urls = findURLs("blahblah https://motorbörsen.se blhblah");
    assert(urls.length, urls.to!string);
}
{
    // Let dlang-requests attempt complex URLs, don't validate more than necessary
    const urls = findURLs("blahblah https://高所恐怖症。co.jp blhblah");
    assert(urls.length, urls.to!string);
}
{
    // Every URL found must come back without its trailing punctuation.
    // NOTE(review): this loop passes vacuously if no URLs are found; consider
    // also asserting the expected urls.length once it has been confirmed.
    const urls = findURLs("nyaa is now at https://nyaa.si, https://nyaa.si? " ~
        "https://nyaa.si. https://nyaa.si! and you should use it https://nyaa.si:");

    foreach (immutable url; urls)
    {
        assert((url == "https://nyaa.si"), url);
    }
}
{
    // Duplicates are kept; the "httpx://" entry in between is skipped.
    const urls = findURLs("https://google.se httpx://google.se https://google.se");
    assert((urls == [ "https://google.se", "https://google.se" ]), urls.to!string);
}
{
    // A scheme followed only by whitespace is not a URL.
    const urls = findURLs("https://               ");
    assert(!urls.length, urls.to!string);
}
{
    // Same as above, for plain "http://".
    const urls = findURLs("http://               ");
    assert(!urls.length, urls.to!string);
}