builtin-mailinfo.c infrastructure changes

I am working on a project that required parsing through regular
mboxes that didn't necessarily have patches embedded in them.  I
started by creating my own modified copy of git-am and working
from there.  Very quickly, I noticed git-mailinfo wasn't able to
handle a big chunk of my email.

After hacking up numerous solutions and running into more
limitations, I decided it was just easier to rewrite a big chunk
of it.  The following patch has a bunch of fixes and features
that I needed in order for me to do what I wanted.

Note: I didn't follow any email RFC papers but I don't think
any of the changes I did required much knowledge (besides the
boundary stuff).

List of major changes/fixes:
- fix for not being able to create empty patch files
- empty patch files don't fail here; this failure will now come from inside git-am
- multipart boundaries are now handled
- only output inbody headers if a patch exists; otherwise assume those
headers are part of the reply and instead output the original headers
- decode and filter base64 patches correctly
- various other accidental fixes

I believe I didn't break any existing functionality or
compatibility (other than what I describe above, which is really
only the empty patch file).

I tested this through various mailing list archives and
everything seemed to parse correctly (a couple thousand emails).

[jc: squashed in another patch from Don's five patch series to
 fix the test case, as this patch exposes the bug in the test.]

Signed-off-by: Don Zickus <dzickus@redhat.com>
Signed-off-by: Junio C Hamano <junkio@cox.net>
This commit is contained in:
Don Zickus 2007-03-12 15:52:04 -04:00 committed by Junio C Hamano
parent 27ebd6e044
commit 87ab799234
5 changed files with 301 additions and 262 deletions

View File

@ -11,19 +11,22 @@ static FILE *cmitmsg, *patchfile, *fin, *fout;
static int keep_subject; static int keep_subject;
static const char *metainfo_charset; static const char *metainfo_charset;
static char line[1000]; static char line[1000];
static char date[1000];
static char name[1000]; static char name[1000];
static char email[1000]; static char email[1000];
static char subject[1000];
static enum { static enum {
TE_DONTCARE, TE_QP, TE_BASE64, TE_DONTCARE, TE_QP, TE_BASE64,
} transfer_encoding; } transfer_encoding;
static char charset[256]; static enum {
TYPE_TEXT, TYPE_OTHER,
} message_type;
static char multipart_boundary[1000]; static char charset[256];
static int multipart_boundary_len;
static int patch_lines; static int patch_lines;
static char **p_hdr_data, **s_hdr_data;
#define MAX_HDR_PARSED 10
#define MAX_BOUNDARIES 5
static char *sanity_check(char *name, char *email) static char *sanity_check(char *name, char *email)
{ {
@ -137,15 +140,13 @@ static int handle_from(char *in_line)
return 1; return 1;
} }
static int handle_date(char *line) static int handle_header(char *line, char *data, int ofs)
{ {
strcpy(date, line); if (!line || !data)
return 0; return 1;
}
strcpy(data, line+ofs);
static int handle_subject(char *line)
{
strcpy(subject, line);
return 0; return 0;
} }
@ -177,17 +178,32 @@ static int slurp_attr(const char *line, const char *name, char *attr)
return 1; return 1;
} }
static int handle_subcontent_type(char *line) struct content_type {
char *boundary;
int boundary_len;
};
static struct content_type content[MAX_BOUNDARIES];
static struct content_type *content_top = content;
static int handle_content_type(char *line)
{ {
/* We do not want to mess with boundary. Note that we do not char boundary[256];
* handle nested multipart.
*/ if (strcasestr(line, "text/") == NULL)
if (strcasestr(line, "boundary=")) { message_type = TYPE_OTHER;
fprintf(stderr, "Not handling nested multipart message.\n"); if (slurp_attr(line, "boundary=", boundary + 2)) {
exit(1); memcpy(boundary, "--", 2);
if (content_top++ >= &content[MAX_BOUNDARIES]) {
fprintf(stderr, "Too many boundaries to handle\n");
exit(1);
}
content_top->boundary_len = strlen(boundary);
content_top->boundary = xmalloc(content_top->boundary_len+1);
strcpy(content_top->boundary, boundary);
} }
slurp_attr(line, "charset=", charset); if (slurp_attr(line, "charset=", charset)) {
if (*charset) {
int i, c; int i, c;
for (i = 0; (c = charset[i]) != 0; i++) for (i = 0; (c = charset[i]) != 0; i++)
charset[i] = tolower(c); charset[i] = tolower(c);
@ -195,17 +211,6 @@ static int handle_subcontent_type(char *line)
return 0; return 0;
} }
static int handle_content_type(char *line)
{
*multipart_boundary = 0;
if (slurp_attr(line, "boundary=", multipart_boundary + 2)) {
memcpy(multipart_boundary, "--", 2);
multipart_boundary_len = strlen(multipart_boundary);
}
slurp_attr(line, "charset=", charset);
return 0;
}
static int handle_content_transfer_encoding(char *line) static int handle_content_transfer_encoding(char *line)
{ {
if (strcasestr(line, "base64")) if (strcasestr(line, "base64"))
@ -219,7 +224,7 @@ static int handle_content_transfer_encoding(char *line)
static int is_multipart_boundary(const char *line) static int is_multipart_boundary(const char *line)
{ {
return (!memcmp(line, multipart_boundary, multipart_boundary_len)); return (!memcmp(line, content_top->boundary, content_top->boundary_len));
} }
static int eatspace(char *line) static int eatspace(char *line)
@ -230,62 +235,6 @@ static int eatspace(char *line)
return len; return len;
} }
#define SEEN_FROM 01
#define SEEN_DATE 02
#define SEEN_SUBJECT 04
#define SEEN_BOGUS_UNIX_FROM 010
#define SEEN_PREFIX 020
/* First lines of body can have From:, Date:, and Subject: or empty */
static void handle_inbody_header(int *seen, char *line)
{
if (*seen & SEEN_PREFIX)
return;
if (isspace(*line)) {
char *cp;
for (cp = line + 1; *cp; cp++) {
if (!isspace(*cp))
break;
}
if (!*cp)
return;
}
if (!memcmp(">From", line, 5) && isspace(line[5])) {
if (!(*seen & SEEN_BOGUS_UNIX_FROM)) {
*seen |= SEEN_BOGUS_UNIX_FROM;
return;
}
}
if (!memcmp("From:", line, 5) && isspace(line[5])) {
if (!(*seen & SEEN_FROM) && handle_from(line+6)) {
*seen |= SEEN_FROM;
return;
}
}
if (!memcmp("Date:", line, 5) && isspace(line[5])) {
if (!(*seen & SEEN_DATE)) {
handle_date(line+6);
*seen |= SEEN_DATE;
return;
}
}
if (!memcmp("Subject:", line, 8) && isspace(line[8])) {
if (!(*seen & SEEN_SUBJECT)) {
handle_subject(line+9);
*seen |= SEEN_SUBJECT;
return;
}
}
if (!memcmp("[PATCH]", line, 7) && isspace(line[7])) {
if (!(*seen & SEEN_SUBJECT)) {
handle_subject(line);
*seen |= SEEN_SUBJECT;
return;
}
}
*seen |= SEEN_PREFIX;
}
static char *cleanup_subject(char *subject) static char *cleanup_subject(char *subject)
{ {
if (keep_subject) if (keep_subject)
@ -296,7 +245,7 @@ static char *cleanup_subject(char *subject)
switch (*subject) { switch (*subject) {
case 'r': case 'R': case 'r': case 'R':
if (!memcmp("e:", subject+1, 2)) { if (!memcmp("e:", subject+1, 2)) {
subject +=3; subject += 3;
continue; continue;
} }
break; break;
@ -341,57 +290,62 @@ static void cleanup_space(char *buf)
} }
static void decode_header(char *it); static void decode_header(char *it);
typedef int (*header_fn_t)(char *); static char *header[MAX_HDR_PARSED] = {
struct header_def { "From","Subject","Date",
const char *name;
header_fn_t func;
int namelen;
}; };
static void check_header(char *line, struct header_def *header) static int check_header(char *line, char **hdr_data)
{ {
int i; int i;
if (header[0].namelen <= 0) { /* search for the interesting parts */
for (i = 0; header[i].name; i++) for (i = 0; header[i]; i++) {
header[i].namelen = strlen(header[i].name); int len = strlen(header[i]);
} if (!hdr_data[i] &&
for (i = 0; header[i].name; i++) { !strncasecmp(line, header[i], len) &&
int len = header[i].namelen;
if (!strncasecmp(line, header[i].name, len) &&
line[len] == ':' && isspace(line[len + 1])) { line[len] == ':' && isspace(line[len + 1])) {
/* Unwrap inline B and Q encoding, and optionally /* Unwrap inline B and Q encoding, and optionally
* normalize the meta information to utf8. * normalize the meta information to utf8.
*/ */
decode_header(line + len + 2); decode_header(line + len + 2);
header[i].func(line + len + 2); hdr_data[i] = xmalloc(1000 * sizeof(char));
break; if (! handle_header(line, hdr_data[i], len + 2)) {
return 1;
}
} }
} }
}
static void check_subheader_line(char *line) /* Content stuff */
{ if (!strncasecmp(line, "Content-Type", 12) &&
static struct header_def header[] = { line[12] == ':' && isspace(line[12 + 1])) {
{ "Content-Type", handle_subcontent_type }, decode_header(line + 12 + 2);
{ "Content-Transfer-Encoding", if (! handle_content_type(line)) {
handle_content_transfer_encoding }, return 1;
{ NULL }, }
}; }
check_header(line, header); if (!strncasecmp(line, "Content-Transfer-Encoding", 25) &&
} line[25] == ':' && isspace(line[25 + 1])) {
static void check_header_line(char *line) decode_header(line + 25 + 2);
{ if (! handle_content_transfer_encoding(line)) {
static struct header_def header[] = { return 1;
{ "From", handle_from }, }
{ "Date", handle_date }, }
{ "Subject", handle_subject },
{ "Content-Type", handle_content_type }, /* for inbody stuff */
{ "Content-Transfer-Encoding", if (!memcmp(">From", line, 5) && isspace(line[5]))
handle_content_transfer_encoding }, return 1;
{ NULL }, if (!memcmp("[PATCH]", line, 7) && isspace(line[7])) {
}; for (i = 0; header[i]; i++) {
check_header(line, header); if (!memcmp("Subject: ", header[i], 9)) {
if (! handle_header(line, hdr_data[i], 0)) {
return 1;
}
}
}
}
/* no match */
return 0;
} }
static int is_rfc2822_header(char *line) static int is_rfc2822_header(char *line)
@ -647,39 +601,139 @@ static void decode_transfer_encoding(char *line)
} }
} }
static void handle_info(void) static int handle_filter(char *line);
static int find_boundary(void)
{ {
char *sub; while(fgets(line, sizeof(line), fin) != NULL) {
if (is_multipart_boundary(line))
sub = cleanup_subject(subject); return 1;
cleanup_space(name); }
cleanup_space(date); return 0;
cleanup_space(email);
cleanup_space(sub);
fprintf(fout, "Author: %s\nEmail: %s\nSubject: %s\nDate: %s\n\n",
name, email, sub, date);
} }
/* We are inside message body and have read line[] already. static int handle_boundary(void)
* Spit out the commit log.
*/
static int handle_commit_msg(int *seen)
{ {
again:
if (!memcmp(line+content_top->boundary_len, "--", 2)) {
/* we hit an end boundary */
/* pop the current boundary off the stack */
free(content_top->boundary);
/* technically won't happen as is_multipart_boundary()
will fail first. But just in case..
*/
if (content_top-- < content) {
fprintf(stderr, "Detected mismatched boundaries, "
"can't recover\n");
exit(1);
}
handle_filter("\n");
/* skip to the next boundary */
if (!find_boundary())
return 0;
goto again;
}
/* set some defaults */
transfer_encoding = TE_DONTCARE;
charset[0] = 0;
message_type = TYPE_TEXT;
/* slurp in this section's info */
while (read_one_header_line(line, sizeof(line), fin))
check_header(line, p_hdr_data);
/* eat the blank line after section info */
return (fgets(line, sizeof(line), fin) != NULL);
}
static int handle_commit_msg(char *line)
{
static int still_looking = 1;
if (!cmitmsg) if (!cmitmsg)
return 0; return 0;
do {
if (!memcmp("diff -", line, 6) || if (still_looking) {
!memcmp("---", line, 3) || char *cp = line;
!memcmp("Index: ", line, 7)) if (isspace(*line)) {
for (cp = line + 1; *cp; cp++) {
if (!isspace(*cp))
break;
}
if (!*cp)
return 0;
}
if ((still_looking = check_header(cp, s_hdr_data)) != 0)
return 0;
}
if (!memcmp("diff -", line, 6) ||
!memcmp("---", line, 3) ||
!memcmp("Index: ", line, 7)) {
fclose(cmitmsg);
cmitmsg = NULL;
return 1;
}
fputs(line, cmitmsg);
return 0;
}
static int handle_patch(char *line)
{
fputs(line, patchfile);
patch_lines++;
return 0;
}
static int handle_filter(char *line)
{
static int filter = 0;
/* filter tells us which part we left off on
* a non-zero return indicates we hit a filter point
*/
switch (filter) {
case 0:
if (!handle_commit_msg(line))
break; break;
if ((multipart_boundary[0] && is_multipart_boundary(line))) { filter++;
/* We come here when the first part had only case 1:
* the commit message without any patch. We if (!handle_patch(line))
* pretend we have not seen this line yet, and break;
* go back to the loop. filter++;
*/ default:
return 1; return 1;
}
return 0;
}
static void handle_body(void)
{
int rc = 0;
static char newline[2000];
static char *np = newline;
/* Skip up to the first boundary */
if (content_top->boundary) {
if (!find_boundary())
return;
}
do {
/* process any boundary lines */
if (content_top->boundary && is_multipart_boundary(line)) {
/* flush any leftover */
if ((transfer_encoding == TE_BASE64) &&
(np != newline)) {
handle_filter(newline);
}
if (!handle_boundary())
return;
} }
/* Unwrap transfer encoding and optionally /* Unwrap transfer encoding and optionally
@ -689,105 +743,80 @@ static int handle_commit_msg(int *seen)
if (metainfo_charset) if (metainfo_charset)
convert_to_utf8(line, charset); convert_to_utf8(line, charset);
handle_inbody_header(seen, line); switch (transfer_encoding) {
if (!(*seen & SEEN_PREFIX)) case TE_BASE64:
{
char *op = line;
/* binary data most likely doesn't have newlines */
if (message_type != TYPE_TEXT) {
rc = handle_filter(line);
break;
}
/* this is a decoded line that may contain
* multiple new lines. Pass only one chunk
* at a time to handle_filter()
*/
do {
while (*op != '\n' && *op != 0)
*np++ = *op++;
*np = *op;
if (*np != 0) {
/* should be sitting on a new line */
*(++np) = 0;
op++;
rc = handle_filter(newline);
np = newline;
}
} while (*op != 0);
/* the partial chunk is saved in newline and
* will be appended by the next iteration of fgets
*/
break;
}
default:
rc = handle_filter(line);
}
if (rc)
/* nothing left to filter */
break;
} while (fgets(line, sizeof(line), fin));
return;
}
static void handle_info(void)
{
char *sub;
char *hdr;
int i;
for (i = 0; header[i]; i++) {
/* only print inbody headers if we output a patch file */
if (patch_lines && s_hdr_data[i])
hdr = s_hdr_data[i];
else if (p_hdr_data[i])
hdr = p_hdr_data[i];
else
continue; continue;
fputs(line, cmitmsg); if (!memcmp(header[i], "Subject", 7)) {
} while (fgets(line, sizeof(line), fin) != NULL); sub = cleanup_subject(hdr);
fclose(cmitmsg); cleanup_space(sub);
cmitmsg = NULL; fprintf(fout, "Subject: %s\n", sub);
return 0; } else if (!memcmp(header[i], "From", 4)) {
} handle_from(hdr);
fprintf(fout, "Author: %s\n", name);
/* We have done the commit message and have the first fprintf(fout, "Email: %s\n", email);
* line of the patch in line[]. } else {
*/ cleanup_space(hdr);
static void handle_patch(void) fprintf(fout, "%s: %s\n", header[i], hdr);
{
do {
if (multipart_boundary[0] && is_multipart_boundary(line))
break;
/* Only unwrap transfer encoding but otherwise do not
* do anything. We do *NOT* want UTF-8 conversion
* here; we are dealing with the user payload.
*/
decode_transfer_encoding(line);
fputs(line, patchfile);
patch_lines++;
} while (fgets(line, sizeof(line), fin) != NULL);
}
/* multipart boundary and transfer encoding are set up for us, and we
* are at the end of the sub header. do equivalent of handle_body up
* to the next boundary without closing patchfile --- we will expect
* that the first part to contain commit message and a patch, and
* handle other parts as pure patches.
*/
static int handle_multipart_one_part(int *seen)
{
int n = 0;
while (fgets(line, sizeof(line), fin) != NULL) {
again:
n++;
if (is_multipart_boundary(line))
break;
if (handle_commit_msg(seen))
goto again;
handle_patch();
break;
}
if (n == 0)
return -1;
return 0;
}
static void handle_multipart_body(void)
{
int seen = 0;
int part_num = 0;
/* Skip up to the first boundary */
while (fgets(line, sizeof(line), fin) != NULL)
if (is_multipart_boundary(line)) {
part_num = 1;
break;
} }
if (!part_num)
return;
/* We are on boundary line. Start slurping the subhead. */
while (1) {
int hdr = read_one_header_line(line, sizeof(line), fin);
if (!hdr) {
if (handle_multipart_one_part(&seen) < 0)
return;
/* Reset per part headers */
transfer_encoding = TE_DONTCARE;
charset[0] = 0;
}
else
check_subheader_line(line);
}
fclose(patchfile);
if (!patch_lines) {
fprintf(stderr, "No patch found\n");
exit(1);
}
}
/* Non multipart message */
static void handle_body(void)
{
int seen = 0;
handle_commit_msg(&seen);
handle_patch();
fclose(patchfile);
if (!patch_lines) {
fprintf(stderr, "No patch found\n");
exit(1);
} }
fprintf(fout, "\n");
} }
int mailinfo(FILE *in, FILE *out, int ks, const char *encoding, int mailinfo(FILE *in, FILE *out, int ks, const char *encoding,
@ -809,18 +838,16 @@ int mailinfo(FILE *in, FILE *out, int ks, const char *encoding,
fclose(cmitmsg); fclose(cmitmsg);
return -1; return -1;
} }
while (1) {
int hdr = read_one_header_line(line, sizeof(line), fin); p_hdr_data = xcalloc(MAX_HDR_PARSED, sizeof(char *));
if (!hdr) { s_hdr_data = xcalloc(MAX_HDR_PARSED, sizeof(char *));
if (multipart_boundary[0])
handle_multipart_body(); /* process the email header */
else while (read_one_header_line(line, sizeof(line), fin))
handle_body(); check_header(line, p_hdr_data);
handle_info();
break; handle_body();
} handle_info();
check_header_line(line);
}
return 0; return 0;
} }

View File

@ -290,6 +290,10 @@ do
git-mailinfo $keep $utf8 "$dotest/msg" "$dotest/patch" \ git-mailinfo $keep $utf8 "$dotest/msg" "$dotest/patch" \
<"$dotest/$msgnum" >"$dotest/info" || <"$dotest/$msgnum" >"$dotest/info" ||
stop_here $this stop_here $this
test -s $dotest/patch || {
echo "Patch is empty. Was is split wrong?"
stop_here $this
}
git-stripspace < "$dotest/msg" > "$dotest/msg-clean" git-stripspace < "$dotest/msg" > "$dotest/msg-clean"
;; ;;
esac esac

View File

@ -77,6 +77,10 @@ do
*) *)
git-mailinfo $keep_subject $utf8 \ git-mailinfo $keep_subject $utf8 \
.dotest/msg .dotest/patch <$i >.dotest/info || exit 1 .dotest/msg .dotest/patch <$i >.dotest/info || exit 1
test -s $dotest/patch || {
echo "Patch is empty. Was is split wrong?"
stop_here $this
}
git-stripspace < .dotest/msg > .dotest/msg-clean git-stripspace < .dotest/msg > .dotest/msg-clean
;; ;;
esac esac

View File

@ -73,6 +73,10 @@ mkdir $tmp_dir || exit 2
for patch_name in $(cat "$QUILT_PATCHES/series" | grep -v '^#'); do for patch_name in $(cat "$QUILT_PATCHES/series" | grep -v '^#'); do
echo $patch_name echo $patch_name
(cat $QUILT_PATCHES/$patch_name | git-mailinfo "$tmp_msg" "$tmp_patch" > "$tmp_info") || exit 3 (cat $QUILT_PATCHES/$patch_name | git-mailinfo "$tmp_msg" "$tmp_patch" > "$tmp_info") || exit 3
test -s $dotest/patch || {
echo "Patch is empty. Was is split wrong?"
stop_here $this
}
# Parse the author information # Parse the author information
export GIT_AUTHOR_NAME=$(sed -ne 's/Author: //p' "$tmp_info") export GIT_AUTHOR_NAME=$(sed -ne 's/Author: //p' "$tmp_info")

View File

@ -61,7 +61,7 @@ diff --git a/git-cvsimport-script b/git-cvsimport-script
push(@old,$fn); push(@old,$fn);
-- --
David Kågedal David Kågedal
- -
To unsubscribe from this list: send the line "unsubscribe git" in To unsubscribe from this list: send the line "unsubscribe git" in
the body of a message to majordomo@vger.kernel.org the body of a message to majordomo@vger.kernel.org