mirror of
https://github.com/lz4/lz4.git
synced 2024-11-23 18:04:05 +08:00
367 lines
13 KiB
C
367 lines
13 KiB
C
/*
|
|
lorem.c - lorem ipsum generator
|
|
Copyright (C) Yann Collet 2024
|
|
|
|
GPL v2 License
|
|
|
|
This program is free software; you can redistribute it and/or modify
|
|
it under the terms of the GNU General Public License as published by
|
|
the Free Software Foundation; either version 2 of the License, or
|
|
(at your option) any later version.
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
GNU General Public License for more details.
|
|
|
|
You should have received a copy of the GNU General Public License along
|
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
|
|
You can contact the author at :
|
|
- LZ4 source repository : https://github.com/lz4/lz4
|
|
- Public forum : https://groups.google.com/forum/#!forum/lz4c
|
|
*/
|
|
|
|
/* Implementation notes:
|
|
*
|
|
* This is a very simple lorem ipsum generator
|
|
* which features a static list of words
|
|
 * and prints them one after another randomly
|
|
* with a fake sentence / paragraph structure.
|
|
*
|
|
* The goal is to generate a printable text
|
|
* that can be used to fake a text compression scenario.
|
|
* The resulting compression / ratio curve of the lorem ipsum generator
|
|
* is more satisfying than the previous statistical generator,
|
|
* which was initially designed for entropy compression,
|
|
* and lacks a regularity more representative of text.
|
|
*
|
|
* The compression ratio achievable on the generated lorem ipsum
|
|
* is still a bit too good, presumably because the dictionary is a bit too
|
|
* small. It would be possible to create some more complex scheme, notably by
|
|
* enlarging the dictionary with a word generator, and adding grammatical rules
|
|
* (composition) and syntax rules. But that's probably overkill for the intended
|
|
* goal.
|
|
*/
|
|
|
|
#include "lorem.h"
|
|
#include <assert.h>
|
|
#include <limits.h> /* INT_MAX */
|
|
#include <stdlib.h> /* malloc, abort */
|
|
#include <string.h> /* memcpy */
|
|
|
|
/* Word pool used by the generator.
 * Note: all words must have a len <= 16
 * (generateWord() copies each word with a fixed 16-byte memcpy). */
static const char* kWords[] = {
    "lorem",        "ipsum",        "dolor",        "sit",          "amet",
    "consectetur",  "adipiscing",   "elit",         "sed",          "do",
    "eiusmod",      "tempor",       "incididunt",   "ut",           "labore",
    "et",           "dolore",       "magna",        "aliqua",       "dis",
    "lectus",       "vestibulum",   "mattis",       "ullamcorper",  "velit",
    "commodo",      "a",            "lacus",        "arcu",         "magnis",
    "parturient",   "montes",       "nascetur",     "ridiculus",    "mus",
    "mauris",       "nulla",        "malesuada",    "pellentesque", "eget",
    "gravida",      "in",           "dictum",       "non",          "erat",
    "nam",          "voluptat",     "maecenas",     "blandit",      "aliquam",
    "etiam",        "enim",         "lobortis",     "scelerisque",  "fermentum",
    "dui",          "faucibus",     "ornare",       "at",           "elementum",
    "eu",           "facilisis",    "odio",         "morbi",        "quis",
    "eros",         "donec",        "ac",           "orci",         "purus",
    "turpis",       "cursus",       "leo",          "vel",          "porta",
    "consequat",    "interdum",     "varius",       "vulputate",    "aliquet",
    "pharetra",     "nunc",         "auctor",       "urna",         "id",
    "metus",        "viverra",      "nibh",         "cras",         "mi",
    "unde",         "omnis",        "iste",         "natus",        "error",
    "perspiciatis", "voluptatem",   "accusantium",  "doloremque",   "laudantium",
    "totam",        "rem",          "aperiam",      "eaque",        "ipsa",
    "quae",         "ab",           "illo",         "inventore",    "veritatis",
    "quasi",        "architecto",   "beatae",       "vitae",        "dicta",
    "sunt",         "explicabo",    "nemo",         "ipsam",        "quia",
    "voluptas",     "aspernatur",   "aut",          "odit",         "fugit",
    "consequuntur", "magni",        "dolores",      "eos",          "qui",
    "ratione",      "sequi",        "nesciunt",     "neque",        "porro",
    "quisquam",     "est",          "dolorem",      "adipisci",     "numquam",
    "eius",         "modi",         "tempora",      "incidunt",     "magnam",
    "quaerat",      "ad",           "minima",       "veniam",       "nostrum",
    "ullam",        "corporis",     "suscipit",     "laboriosam",   "nisi",
    "aliquid",      "ex",           "ea",           "commodi",      "consequatur",
    "autem",        "eum",          "iure",         "voluptate",    "esse",
    "quam",         "nihil",        "molestiae",    "illum",        "fugiat",
    "quo",          "pariatur",     "vero",         "accusamus",    "iusto",
    "dignissimos",  "ducimus",      "blanditiis",   "praesentium",  "voluptatum",
    "deleniti",     "atque",        "corrupti",     "quos",         "quas",
    "molestias",    "excepturi",    "sint",         "occaecati",    "cupiditate",
    "provident",    "similique",    "culpa",        "officia",      "deserunt",
    "mollitia",     "animi",        "laborum",      "dolorum",      "fuga",
    "harum",        "quidem",       "rerum",        "facilis",      "expedita",
    "distinctio",   "libero",       "tempore",      "cum",          "soluta",
    "nobis",        "eligendi",     "optio",        "cumque",       "impedit",
    "minus",        "quod",         "maxime",       "placeat",      "facere",
    "possimus",     "assumenda",    "repellendus",  "temporibus",   "quibusdam",
    "officiis",     "debitis",      "saepe",        "eveniet",      "voluptates",
    "repudiandae",  "recusandae",   "itaque",       "earum",        "hic",
    "tenetur",      "sapiente",     "delectus",     "reiciendis",   "cillum",
    "maiores",      "alias",        "perferendis",  "doloribus",    "asperiores",
    "repellat",     "minim",        "nostrud",      "exercitation", "ullamco",
    "laboris",      "aliquip",      "duis",         "aute",         "irure",
};
|
|
#define KNBWORDS (sizeof(kWords) / sizeof(kWords[0]))
|
|
static const unsigned kNbWords = KNBWORDS;
|
|
|
|
static const char* g_words[KNBWORDS] = { NULL };
|
|
static unsigned g_wordLen[KNBWORDS] = {0};
|
|
static char* g_wordBuffer = NULL;
|
|
|
|
/* Simple 1-dimension distribution, indexed by word length; favors small words.
 * Word lengths past the last index use the last weight. */
static const int kWeights[] = { 0, 8, 6, 4, 3, 2 };
static const unsigned kNbWeights = sizeof(kWeights) / sizeof(kWeights[0]);

/* g_distrib      : table of word indices, each word repeated `weight` times
 * g_distribCount : number of valid entries in g_distrib
 *                  (0 means the tables have not been initialized yet) */
#define DISTRIB_SIZE_MAX 650
static int g_distrib[DISTRIB_SIZE_MAX] = { 0 };
static unsigned g_distribCount = 0;
|
|
|
|
static void countFreqs(
|
|
const unsigned wordLen[],
|
|
size_t nbWords,
|
|
const int* weights,
|
|
unsigned long nbWeights)
|
|
{
|
|
unsigned total = 0;
|
|
size_t w;
|
|
for (w = 0; w < nbWords; w++) {
|
|
size_t len = wordLen[w];
|
|
int lmax;
|
|
if (len >= nbWeights)
|
|
len = nbWeights - 1;
|
|
lmax = weights[len];
|
|
total += (unsigned)lmax;
|
|
}
|
|
g_distribCount = total;
|
|
assert(g_distribCount <= DISTRIB_SIZE_MAX);
|
|
}
|
|
|
|
static void init_word_len(
|
|
const char* words[],
|
|
size_t nbWords)
|
|
{
|
|
size_t n;
|
|
assert(words != NULL);
|
|
for (n=0; n<nbWords; n++) {
|
|
assert(words[n] != NULL);
|
|
assert(strlen(words[n]) < 256);
|
|
g_wordLen[n] = (unsigned char)strlen(words[n]);
|
|
}
|
|
|
|
}
|
|
|
|
/* sumLen():
 * @sizes  : array of lengths, must not be NULL
 * @s      : number of entries in sizes
 * @return : sum of the first `s` entries of `sizes` (0 when s == 0)
 */
static size_t sumLen(const unsigned* sizes, size_t s)
{
    size_t total = 0;
    size_t n;
    assert(sizes != NULL);
    for (n = 0; n < s; n++) {
        total += sizes[n];
    }
    return total;
}
|
|
|
|
static void init_word_buffer(void)
|
|
{
|
|
size_t n;
|
|
size_t const bufSize = sumLen(g_wordLen, kNbWords) + 16;
|
|
char* ptr;
|
|
assert(g_wordBuffer == NULL);
|
|
g_wordBuffer = (char*)calloc(1, bufSize);
|
|
if (g_wordBuffer == NULL) abort();
|
|
ptr = g_wordBuffer;
|
|
for (n=0; n<kNbWords; n++) {
|
|
memcpy(ptr, kWords[n], g_wordLen[n]);
|
|
g_words[n] = ptr;
|
|
ptr += g_wordLen[n];
|
|
}
|
|
}
|
|
|
|
static void init_word_distrib(
|
|
const unsigned wordLen[],
|
|
size_t nbWords,
|
|
const int* weights,
|
|
unsigned long nbWeights)
|
|
{
|
|
size_t w, d = 0;
|
|
countFreqs(wordLen, nbWords, weights, nbWeights);
|
|
for (w = 0; w < nbWords; w++) {
|
|
size_t len = wordLen[w];
|
|
int l, lmax;
|
|
if (len >= nbWeights)
|
|
len = nbWeights - 1;
|
|
lmax = weights[len];
|
|
for (l = 0; l < lmax; l++) {
|
|
g_distrib[d++] = (int)w;
|
|
}
|
|
}
|
|
}
|
|
|
|
/* Generation state, (re)set at the start of each LOREM_genBlock() call.
 * Note: this unit only works when invoked sequentially.
 * No concurrent access is allowed */
static char* g_ptr = NULL;           /* output buffer currently being filled */
static size_t g_nbChars = 0;         /* nb of bytes already written into g_ptr */
static size_t g_maxChars = 10000000; /* capacity of g_ptr, in bytes */
static unsigned g_randRoot = 0;      /* state of LOREM_rand() */
|
|
|
|
#define RDG_rotl32(x, r) ((x << r) | (x >> (32 - r)))
|
|
static unsigned LOREM_rand(unsigned range)
|
|
{
|
|
static const unsigned prime1 = 2654435761U;
|
|
static const unsigned prime2 = 2246822519U;
|
|
unsigned rand32 = g_randRoot;
|
|
rand32 *= prime1;
|
|
rand32 ^= prime2;
|
|
rand32 = RDG_rotl32(rand32, 13);
|
|
g_randRoot = rand32;
|
|
return (unsigned)(((unsigned long long)rand32 * range) >> 32);
|
|
}
|
|
|
|
static void writeLastCharacters(void)
|
|
{
|
|
size_t lastChars = g_maxChars - g_nbChars;
|
|
assert(g_maxChars >= g_nbChars);
|
|
if (lastChars == 0)
|
|
return;
|
|
g_ptr[g_nbChars++] = '.';
|
|
if (lastChars > 2) {
|
|
memset(g_ptr + g_nbChars, ' ', lastChars - 2);
|
|
}
|
|
if (lastChars > 1) {
|
|
g_ptr[g_maxChars - 1] = '\n';
|
|
}
|
|
g_nbChars = g_maxChars;
|
|
}
|
|
|
|
static void generateLastWord(const char* word, size_t wordLen, int upCase)
|
|
{
|
|
if (g_nbChars + wordLen + 2 > g_maxChars) {
|
|
writeLastCharacters();
|
|
return;
|
|
}
|
|
memcpy(g_ptr + g_nbChars, word, wordLen);
|
|
if (upCase) {
|
|
static const char toUp = 'A' - 'a';
|
|
g_ptr[g_nbChars] = (char)(g_ptr[g_nbChars] + toUp);
|
|
}
|
|
g_nbChars += wordLen;
|
|
writeLastCharacters();
|
|
}
|
|
|
|
#define MAX(a,b) ((a)<(b)?(b):(a))
|
|
static void generateWord(const char* word, size_t wordLen, const char* separator, size_t sepLen, int upCase)
|
|
{
|
|
size_t const wlen = MAX(16, wordLen + 2);
|
|
if (g_nbChars + wlen > g_maxChars) {
|
|
generateLastWord(word, wordLen, upCase);
|
|
return;
|
|
}
|
|
assert(wordLen <= 16);
|
|
memcpy(g_ptr + g_nbChars, word, 16);
|
|
if (upCase) {
|
|
static const char toUp = 'A' - 'a';
|
|
g_ptr[g_nbChars] = (char)(g_ptr[g_nbChars] + toUp);
|
|
}
|
|
g_nbChars += wordLen;
|
|
assert(sepLen <= 2);
|
|
memcpy(g_ptr + g_nbChars, separator, 2);
|
|
g_nbChars += sepLen;
|
|
}
|
|
|
|
/* about():
 * Draws two values from LOREM_rand(target) and returns their sum plus one.
 * @return : for target >= 1, a value in [1, 2*target - 1]
 */
static int about(unsigned target)
{
    return (int)(LOREM_rand(target) + LOREM_rand(target) + 1);
}
|
|
|
|
/* Function to generate a random sentence */
|
|
static void generateSentence(int nbWords)
|
|
{
|
|
int commaPos = about(9);
|
|
int comma2 = commaPos + about(7);
|
|
int qmark = (LOREM_rand(11) == 7);
|
|
const char* endSep = qmark ? "? " : ". ";
|
|
int i;
|
|
for (i = 0; i < nbWords; i++) {
|
|
int const wordID = g_distrib[LOREM_rand(g_distribCount)];
|
|
const char* sep = " ";
|
|
size_t sepLen = 1;
|
|
if (i == commaPos)
|
|
sep = ", ", sepLen=2;
|
|
if (i == comma2)
|
|
sep = ", ", sepLen=2;
|
|
if (i == nbWords - 1)
|
|
sep = endSep, sepLen=2;
|
|
generateWord(g_words[wordID], g_wordLen[wordID], sep, sepLen, i == 0);
|
|
}
|
|
}
|
|
|
|
static void generateParagraph(int nbSentences)
|
|
{
|
|
int i;
|
|
for (i = 0; i < nbSentences; i++) {
|
|
int wordsPerSentence = about(11);
|
|
generateSentence(wordsPerSentence);
|
|
}
|
|
if (g_nbChars < g_maxChars) {
|
|
g_ptr[g_nbChars++] = '\n';
|
|
}
|
|
if (g_nbChars < g_maxChars) {
|
|
g_ptr[g_nbChars++] = '\n';
|
|
}
|
|
}
|
|
|
|
/* It's "common" for lorem ipsum generators to start with the same first
|
|
* pre-defined sentence */
|
|
static void generateFirstSentence(void)
|
|
{
|
|
int i;
|
|
for (i = 0; i < 18; i++) {
|
|
const char* separator = " ";
|
|
size_t sepLen = 1;
|
|
if (i == 4)
|
|
separator = ", ", sepLen=2;
|
|
if (i == 7)
|
|
separator = ", ", sepLen=2;
|
|
generateWord(g_words[i], g_wordLen[i], separator, sepLen, i == 0);
|
|
}
|
|
generateWord(g_words[18], g_wordLen[18], ". ", 2, 0);
|
|
}
|
|
|
|
size_t
|
|
LOREM_genBlock(void* buffer, size_t size, unsigned seed, int first, int fill)
|
|
{
|
|
g_ptr = (char*)buffer;
|
|
assert(size < INT_MAX);
|
|
g_maxChars = size;
|
|
g_nbChars = 0;
|
|
g_randRoot = seed;
|
|
if (g_distribCount == 0) {
|
|
init_word_len(kWords, kNbWords);
|
|
init_word_buffer();
|
|
init_word_distrib(g_wordLen, kNbWords, kWeights, kNbWeights);
|
|
}
|
|
|
|
if (first) {
|
|
generateFirstSentence();
|
|
}
|
|
while (g_nbChars < g_maxChars) {
|
|
int sentencePerParagraph = about(7);
|
|
generateParagraph(sentencePerParagraph);
|
|
if (!fill)
|
|
break; /* only generate one paragraph in not-fill mode */
|
|
}
|
|
g_ptr = NULL;
|
|
return g_nbChars;
|
|
}
|
|
|
|
/* LOREM_genBuffer():
 * Fills `buffer` with `size` bytes of lorem ipsum text, starting with the
 * pre-defined first sentence.
 * Same as LOREM_genBlock() with first=1 and fill=1.
 */
void LOREM_genBuffer(void* buffer, size_t size, unsigned seed)
{
    (void)LOREM_genBlock(buffer, size, seed, 1, 1);
}
|