findURLs

Finds URLs in a string, returning an array of them. Does not filter out duplicates.

Replacement for regex matching, using much less memory when compiling (around 300 MB).

To consider: does this need a dstring?

@safe pure
findURLs
(
const string line
)

Parameters

line string

String line to examine and find URLs in.

Return Value

Type: auto

A string[] array of found URLs. These include fragment identifiers.

Examples

// Replaces the following:
// enum stephenhay = `\bhttps?://[^\s/$.?#].[^\s]*`;
// static urlRegex = ctRegex!stephenhay;

// Only the two http(s) URLs are expected back; the "httpx://" token is not counted.
string[] urls = findURLs("blah https://google.com http://facebook.com httpx://wefpokwe");
assert(urls.length == 2);
import std.conv : to;

// A lone URL is returned unchanged as the only element.
{
    const found = findURLs("http://google.com");
    assert(found.length == 1, to!string(found));
    assert(found[0] == "http://google.com", found[0]);
}

// "shttps://c" is not reported, and trailing whitespace is not part of the last URL.
{
    const found = findURLs("blah https://a.com http://b.com shttps://c https://d.asdf.asdf.asdf        ");
    const expected = [ "https://a.com", "http://b.com", "https://d.asdf.asdf.asdf" ];
    assert(found.length == 3, to!string(found));
    assert(found == expected, to!string(found));
}

// Exactly one of these tokens is expected to be reported as a URL.
{
    const found = findURLs("http:// http://asdf https:// asdfhttpasdf http://google.com");
    assert(found.length == 1, to!string(found));
}

// Schemes run together without separating whitespace must yield nothing.
{
    const found = findURLs("http://a.sehttp://a.shttp://a.http://http:");
    assert(found.length == 0, to!string(found));
}

// Non-ASCII characters in the host must not prevent a match.
{
    const found = findURLs("blahblah https://motorbörsen.se blhblah");
    assert(found.length > 0, to!string(found));
}

// Let dlang-requests attempt complex URLs; don't validate more than necessary.
{
    const found = findURLs("blahblah https://高所恐怖症。co.jp blhblah");
    assert(found.length > 0, to!string(found));
}

// Every reported URL must come back without the punctuation that trails it in the input.
// NOTE(review): this loop passes vacuously if nothing is found; consider also
// asserting the expected number of results once confirmed.
{
    immutable line = "nyaa is now at https://nyaa.si, https://nyaa.si? " ~
        "https://nyaa.si. https://nyaa.si! and you should use it https://nyaa.si:";
    const found = findURLs(line);

    foreach (immutable link; found)
    {
        assert(link == "https://nyaa.si", link);
    }
}

// Duplicates are kept, and the "httpx://" token between them is skipped.
{
    const found = findURLs("https://google.se httpx://google.se https://google.se");
    const expected = [ "https://google.se", "https://google.se" ];
    assert(found == expected, to!string(found));
}

// A scheme followed only by whitespace is not a URL (https).
{
    const found = findURLs("https://               ");
    assert(found.length == 0, to!string(found));
}

// A scheme followed only by whitespace is not a URL (http).
{
    const found = findURLs("http://               ");
    assert(found.length == 0, to!string(found));
}