Intro
This is a pretty dumb idea, but I really just wanted to see how difficult it would be to build, and it turns out it's actually not that bad. I've seen lots of spammer/scammer/who-knows-what shady domains that have made me wonder if they were the product of some kind of stochastic process... and I was reminded of domain generation algorithms, much like the obnoxious Conficker domains that used to plague the internet back in ~2008/2009.
Spam E-mail
Open up your spam folder and look at the origin address for a lot of the crap in there; chances are you are receiving spam e-mail from some bot using a shit domain that looks like either an intentional misspelling or a word glued together from random graphemes.
A rudimentary proof of concept
#!/usr/bin/env python3
import datetime
# ----------------------------
# MASSIVELY EXPANDED COMPONENTS (NOW WITH HUNDREDS MORE FROM LINGUISTIC SOURCES)
# ----------------------------
PREFIXES = [
"", "un", "re", "pre", "post", "over", "under", "hyper", "ultra", "mega", "giga", "anti", "de", "dis", "en", "em", "in", "im",
"il", "ir", "non", "sub", "super", "trans", "ex", "mis", "mal", "fore", "mid", "out", "up", "down", "semi", "hemi", "demi",
"omni", "pan", "proto", "neo", "paleo", "pseudo", "auto", "bio", "cyber", "crypto", "electro", "geo", "hydro", "thermo",
"aero", "micro", "nano", "tele", "retro", "contra", "counter", "inter", "intra", "extra", "peri", "circum", "para", "a",
"ab", "ad", "ante", "bi", "co", "con", "com", "cor", "epi", "hypo", "kilo", "macro", "meta", "mini", "mono", "multi",
"ob", "oc", "of", "op", "poly", "pro", "quad", "quasi", "sur", "syn", "sy", "syl", "sym", "tri", "uni", "vice", "with",
"be", "for", "gain", "twain", "twi", "ambi", "amphi", "ana", "apo", "cata", "dia", "eso", "eu", "hetero", "homo", "iso",
"kata", "mono", "oligo", "ortho", "panto", "sym", "tele", "xeno", "zoo"
]
ROOTS = [
# Original + previous expansions
"pal", "fin", "triv", "gran", "mell", "dor", "stel", "clo", "brel", "vint", "lop", "sand", "norl", "wel", "cren", "zyl",
"thor", "vex", "nyx", "lumin", "umbr", "void", "aether", "quor", "xel", "drak", "fyr", "glac", "storm", "necr", "astr",
"chron", "vort", "flux", "syn", "grav", "torr", "blaz", "frost", "shade", "echo", "mir", "spectr", "phas", "ion", "terr",
"aqu", "flor", "sylv", "arbor", "petr", "cryst", "aur", "sol", "lun", "stell", "nov", "ecl", "orb", "ray", "beam", "spark",
"bolt", "wave", "tide", "byt", "cod", "dat", "net", "grid", "node", "link", "core", "plex", "hack", "shard", "rift",
"cache", "stack", "queue", "thread", "loop", "kernel", "shell", "aud", "bene", "dict",
# Added from common root word lists (Greek, Latin, English origins)
"act", "ag", "alt", "am", "anim", "annu", "anthrop", "arch", "aster", "audi", "bell", "biblio", "brev", "cad", "cap",
"carn", "ced", "centr", "cept", "cert", "cid", "circ", "cis", "clam", "clin", "cogn", "cord", "corp", "cred", "cruc",
"crypt", "cub", "cur", "cycl", "dem", "derm", "dex", "doc", "domin", "duc", "dur", "ego", "equ", "fac", "fer", "fid",
"fig", "flect", "flu", "form", "fort", "fract", "frag", "fug", "fus", "gam", "gen", "gest", "gnos", "grad", "graph",
"greg", "gyn", "hab", "hema", "heter", "hom", "hydr", "ject", "jud", "junct", "jur", "lab", "later", "laud", "lav",
"leg", "lev", "liber", "lingu", "liter", "loc", "log", "loqu", "luc", "lud", "lum", "lun", "magn", "mal", "man", "mand",
"mar", "mater", "medi", "mem", "ment", "merg", "meter", "migr", "min", "mir", "mit", "mob", "mon", "mor", "morph",
"mort", "mot", "mut", "nas", "nat", "nav", "nect", "neg", "neur", "nom", "nov", "numer", "onym", "oper", "opt", "ora",
"orn", "oss", "pac", "path", "pater", "ped", "pel", "pend", "phon", "photo", "phys", "plac", "plaud", "plen", "plic",
"plu", "pneum", "pod", "pon", "pop", "port", "pos", "pot", "prim", "prob", "psych", "pug", "punct", "quer", "ques",
"qui", "rad", "rap", "rect", "reg", "rid", "ris", "rog", "rupt", "sacr", "san", "sat", "sci", "scop", "scrib", "script",
"sec", "sed", "sem", "sen", "sent", "sequ", "serv", "sign", "simil", "sol", "solv", "somn", "son", "soph", "spec",
"sphere", "spir", "sta", "stell", "stit", "string", "stru", "surg", "tact", "tang", "techn", "temp", "ten", "tend",
"terr", "test", "the", "therm", "tom", "tort", "tract", "trib", "trud", "turb", "umbr", "urb", "vac", "val", "ven",
"ver", "verb", "vers", "vert", "vi", "vid", "vis", "viv", "voc", "volv", "vor", "zo"
]
SUFFIXES = [
"", "ing", "ful", "less", "ment", "ly", "er", "ion", "en", "ity", "ty", "ness", "able", "ible", "al", "an", "ative",
"ed", "ize", "tion", "ation", "ive", "itive", "or", "ist", "ism", "ship", "hood", "dom", "y", "ous", "ic", "ish",
"like", "ward", "wise", "th", "s", "es", "ant", "ent", "ary", "ory", "ery", "age", "ance", "ence", "cy", "let", "ling",
"ette", "ess", "fold", "most", "proof", "scape", "some", "ster", "ways", "ible", "ify", "ise", "ize", "logy", "pathy",
"phobia", "scope", "cracy", "crat", "gram", "graph", "meter", "ology", "onym", "phone", "speak", "plast", "ibility",
"kinesis", "polis", "mobile", "son"
]
# ----------------------------
# Simple PRNG
# ----------------------------
def next_int(n):
    # One round of xorshift-style mixing, masked to 64 bits
    n ^= (n << 13) & 0xFFFFFFFFFFFFFFFF
    n ^= (n >> 17)
    n ^= (n << 5) & 0xFFFFFFFFFFFFFFFF
    return n & 0xFFFFFFFFFFFFFFFF

def pick(n, lst):
    # Deterministically map the current seed onto a list element
    return lst[n % len(lst)]
# ----------------------------
# Generate the hourly word
# ----------------------------
def hourly_word(dt):
    seed = dt.year * 1000000 + dt.month * 10000 + dt.day * 100 + dt.hour
    seed = next_int(seed)
    prefix = pick(seed, PREFIXES)
    seed = next_int(seed)
    root1 = pick(seed, ROOTS)
    seed = next_int(seed)
    if seed % 3 == 0:  # 1-in-3 chance of a second root for extra spice
        seed = next_int(seed)
        root2 = pick(seed, ROOTS)
    else:
        root2 = ""
    seed = next_int(seed)
    suffix = pick(seed, SUFFIXES)
    word = prefix
    if len(root1) <= 3 and root2:
        word += root1 + "-" + root2
    else:
        word += root1 + root2
    word += suffix
    return word.capitalize()  # looks cooler as a proper noun
# ----------------------------
# Print exactly ONE word for right now (UTC)
# ----------------------------
if __name__ == "__main__":
    now = datetime.datetime.utcnow().replace(minute=0, second=0, microsecond=0)
    print(hourly_word(now))
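Since the seed is derived from the date and hour, you can preview upcoming words by feeding future timestamps into hourly_word(). A small snippet you could append to the script above (just a sanity check, not part of the original):

# Preview the next six hourly words by offsetting the current hour
base = datetime.datetime.utcnow().replace(minute=0, second=0, microsecond=0)
for h in range(6):
    print(hourly_word(base + datetime.timedelta(hours=h)))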
The results are in
I picked a few that I thought were funny:
- unthreadmanology
- metascopible
- homogynscope
- xenotestling
- thermomorphor
- ilanthropative
This one is really good:
- abmirable
Because if people are anything like me, then it's not unimaginable that they might read it as admirable and not even realize it's misspelled.
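You can put a number on how close the fake is to the real thing. A quick check with a textbook Levenshtein distance (illustrative only, not part of the generator):

def levenshtein(a, b):
    """Minimum number of single-character edits turning a into b."""
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        cur = [i]
        for j, cb in enumerate(b, 1):
            cur.append(min(prev[j] + 1,                 # delete from a
                           cur[j - 1] + 1,              # insert into a
                           prev[j - 1] + (ca != cb)))   # substitution
        prev = cur
    return prev[-1]

print(levenshtein("abmirable", "admirable"))  # 1 -- a single substitution

A distance of 1 is exactly the sweet spot for typosquatting: close enough to skim past, different enough to be a distinct domain.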
A few from my inbox
- croweakness.nl
- britingen.de
- porporative.com.co
- autoespacio.com
You would think "oh this one sucks":
- norelgreirop.com
however, it's still a low-entropy word:
One way to detect randomness in text is Shannon entropy, which measures unpredictability. Natural English has low entropy (roughly 0.6–1.3 bits per character) due to predictable letter patterns (e.g., 'q' is almost always followed by 'u'), while random strings (e.g., "q3;F9*j!") have high entropy. Higher entropy signifies greater randomness, whereas low entropy indicates structure, predictability, and low information content.
I get the impression that part of the reason for this is to beat spam filters, which use a similar classifier:
import math
from collections import Counter
def calculate_word_entropy(word):
    """
    Calculates the Shannon entropy of a word based on its characters.
    A higher value indicates more randomness (gibberish).
    """
    if not word:
        return 0.0
    # Count frequency of each character
    char_counts = Counter(word)
    total_chars = len(word)
    # Apply the Shannon entropy formula: H = -Σ p(x) log2(p(x))
    entropy = -sum((count / total_chars) * math.log2(count / total_chars)
                   for count in char_counts.values())
    return round(entropy, 2)
# Comparison Example
test_data = {
    "Real Words": ["banana", "apple", "excellent"],
    "Gibberish": ["qwerty", "pzxqwj", "xdrtfv"]
}
print(f"{'Category':<12} | {'Word':<12} | {'Entropy':<8}")
print("-" * 38)
for category, words in test_data.items():
    for word in words:
        score = calculate_word_entropy(word)
        print(f"{category:<12} | {word:<12} | {score:<8}")
Final thoughts
I hope you won't actually use this for something, you know.. there's enough spam on the internet as it is. I'm sure you won't; actually making something genuinely worthy of concern with this is pretty far-fetched. Among the things you could do to improve it:
- Grammar rules, plus back-off/retry/reselect logic for branches that won't work (the word changes every hour, so every hour should generate something that meets the grammar-rule requirements).
- Not only can you make up fake words, but you can potentially give them different meanings depending on which top-level domain you pair them with. Something that isn't necessarily relevant but comes to mind is the prospect of a phonemic orthography. Etymology is interesting.
- There's probably something to be said for the bastardization that is punycode and IDN as far as this is concerned (see the sketch after this list). I haven't really looked into how spam filters treat these kinds of domains (AFAIK .com domains likely get preferential treatment). It reminds me of things like the Voynich Manuscript, which I have personally enjoyed many hours looking through and reading about. There is something about it that is irresistible.
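On the punycode point, Python's built-in idna codec makes the problem easy to see: a label containing visually confusable Unicode characters encodes to an innocuous-looking xn-- ASCII string, which is what actually goes over the wire. A quick illustration (how browsers and filters display or score these varies):

# "bücher" is a legitimate IDN; "аpple" swaps in a Cyrillic "а" (U+0430)
print("bücher".encode("idna"))  # b'xn--bcher-kva'
print("аpple".encode("idna"))   # b'xn--pple-43d'

To a human the second label reads as "apple"; to the resolver it's an entirely different name.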