c++ – wait() hangs when CLONE_THREAD

I am tracing some processes and their children using ptrace. I am trying to print specific system calls (using a seccomp filter that notifies ptrace, see this blogpost).

In most cases my code (see below) is working fine. However, when I am tracing a java program (from the default-jre package), the latter clones using the CLONE_THREAD flag. And for some reason, my tracer hangs (I believe) because I can’t receive signals from the cloned process. I think the reason is that (according to this discussion) the child process in fact becomes a child of the original process’ parent, instead of becoming the original process’ child.

I reproduced this issue with a simple program that calls clone() with various flags and performs some actions. When I use the CLONE_THREAD | CLONE_SIGHAND | CLONE_VM flags (the clone() documentation specifies that they must come together since Linux 2.6.0), I am at least able to trace everything correctly until one of the two threads finishes.

I would like to trace both threads independently. Is it possible?

More importantly, I need to trace a Java program, and I cannot change it. Here is an strace excerpt of the Java program's clone call:

[...]
4665  clone(child_stack=0x7fb166e95fb0, flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM|CLONE_SETTLS|CLONE_PARENT_SETTID|CLONE_CHILD_CLEARTID, parent_tid=[4666], tls=0x7fb166e96700, child_tidptr=0x7fb166e969d0) = 4666
[...]

So Java seems to respect the rules. I wanted to experiment to understand: I ruled out any flags unrelated to threads (i.e., CLONE_FS | CLONE_FILES | CLONE_SYSVSEM).

Here are the results of running my test program with different combinations of flags (I know, I am really desperate):

  • CLONE_VM|CLONE_SIGHAND|CLONE_THREAD|CLONE_SETTLS: only gets trace from parent

  • CLONE_VM|CLONE_SIGHAND|CLONE_THREAD|CLONE_PARENT_SETTID: inconsistent; gets trace from both until the parent finishes

  • CLONE_VM|CLONE_SIGHAND|CLONE_THREAD|CLONE_CHILD_CLEARTID: inconsistent; gets trace from both until the child finishes

  • CLONE_VM|CLONE_SIGHAND|CLONE_THREAD|CLONE_SETTLS|CLONE_PARENT_SETTID: only gets trace from parent

  • CLONE_VM|CLONE_SIGHAND|CLONE_THREAD|CLONE_SETTLS|CLONE_CHILD_CLEARTID: only gets trace from parent

  • CLONE_VM|CLONE_SIGHAND|CLONE_THREAD|CLONE_PARENT_SETTID|CLONE_SETTLS: only gets trace from parent

  • CLONE_VM|CLONE_SIGHAND|CLONE_THREAD|CLONE_PARENT_SETTID|CLONE_CHILD_CLEARTID: inconsistent; gets trace from both until the child finishes

  • CLONE_VM|CLONE_SIGHAND|CLONE_THREAD|CLONE_CHILD_CLEARTID|CLONE_SETTLS: only gets trace from parent

  • CLONE_VM|CLONE_SIGHAND|CLONE_THREAD|CLONE_CHILD_CLEARTID|CLONE_PARENT_SETTID: inconsistent; gets trace from both until the child finishes

  • CLONE_VM|CLONE_SIGHAND|CLONE_THREAD|CLONE_SETTLS|CLONE_PARENT_SETTID|CLONE_CHILD_CLEARTID: only gets trace from parent

So at least I get the same behavior from my program and the Java program: it does not work.

How can I make it work? For instance, how does strace successfully trace any kind of clone? I tried to dig into its code but I can't find how it does it.

Any help would be appreciated! Best regards,

The tracer code (compile with g++ tracer.cpp -o tracer -g -lseccomp -lexplain):

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <errno.h>
#include <stddef.h>

#include <sys/ptrace.h>
#include <sys/reg.h>
#include <signal.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
#include <sys/user.h>
#include <sys/prctl.h>
#include <fcntl.h>
#include <linux/limits.h>
#include <linux/filter.h>
#include <linux/seccomp.h>
#include <linux/unistd.h>
#include <libexplain/waitpid.h>
#include <tuple>
#include <vector>


/* Size constants; neither is referenced by the code shown in this listing. */
#define DEFAULT_SIZE 1000
#define MAX_SIZE 1000

/* Forward declarations for the helpers that main() relies on. */
int process_signals(void);
int inspect(pid_t pid);
void read_string_into_buff(const pid_t pid, unsigned long long addr, char *buff, unsigned int max_len);

/*
 * Entry point of the tracer.
 *
 * Forks; the child asks to be traced, stops itself so the parent can set the
 * ptrace options, installs a seccomp filter and finally execs the target
 * program. The parent configures tracing and then runs process_signals().
 *
 * Usage: tracer <prog> <arg1> ... <argN>
 * Returns 0 once process_signals() returns, 1 on a usage or set-up error.
 */
int main(int argc, char **argv){
  if (argc < 2) {
      fprintf(stderr, "Usage: %s <prog> <arg1> ... <argN>\n", argv[0]);
      return 1;
  }

  const pid_t pid = fork();
  if (pid < 0) {
      perror("fork");
      return 1;
  }

  if (pid == 0) {
      /* Report getpid() to the tracer (SECCOMP_RET_TRACE); allow every other syscall. */
      struct sock_filter filter[] = {
          BPF_STMT(BPF_LD+BPF_W+BPF_ABS, offsetof(struct seccomp_data, nr)),
          BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_getpid, 0, 1),
          BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_TRACE),
          BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
      };
      struct sock_fprog prog = {
          .len = (unsigned short) (sizeof(filter)/sizeof(filter[0])),
          .filter = filter,
      };
      if (ptrace(PTRACE_TRACEME, 0, NULL, NULL) == -1) {
          perror("ptrace(PTRACE_TRACEME)");
          return 1;
      }
      /*
       * Stop BEFORE the filter is installed. Once the filter is active, a
       * getpid() issued while the tracer has not yet enabled
       * PTRACE_O_TRACESECCOMP is not executed and fails with ENOSYS, so the
       * former kill(getpid(), SIGSTOP) could turn into kill(-1, SIGSTOP) and
       * stop every process the caller is allowed to signal.
       */
      if (raise(SIGSTOP) != 0) {
          perror("raise(SIGSTOP)");
          return 1;
      }
      if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) == -1) {
            perror("prctl(PR_SET_NO_NEW_PRIVS)");
            return 1;
      }
      if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) == -1) {
          perror("when setting seccomp filter");
          return 1;
      }
      execvp(argv[1], argv + 1);
      /* execvp() only returns on failure. */
      perror("execvp");
      return 1;
  }

  /* Parent: wait for the child's initial stop before configuring tracing. */
  int status = 0;
  if (waitpid(pid, &status, 0) < 0) {
      perror("waitpid");
      return 1;
  }
  if (!WIFSTOPPED(status)) {
      fprintf(stderr, "child terminated before it could be traced\n");
      return 1;
  }

  /* TRACEFORK/TRACECLONE/TRACEVFORK make new children and threads tracees too. */
  const long options = PTRACE_O_TRACESECCOMP | PTRACE_O_TRACEFORK | PTRACE_O_TRACECLONE | PTRACE_O_TRACEVFORK;
  if (ptrace(PTRACE_SETOPTIONS, pid, NULL, (void *)options) == -1) {
      perror("ptrace(PTRACE_SETOPTIONS)");
      kill(pid, SIGKILL);
      return 1;
  }
  if (ptrace(PTRACE_CONT, pid, NULL, NULL) == -1) {
      perror("ptrace(PTRACE_CONT)");
      kill(pid, SIGKILL);
      return 1;
  }
  process_signals();
  return 0;
}


int process_signals(){
  int status;
  while (1){
    pid_t child_pid;
    // When child status changes
    if ((child_pid = waitpid(-1, &status, 0)) < 0){
      fprintf(stderr, "%sn", explain_waitpid(child_pid, &status, 0));
      exit(EXIT_FAILURE);
    }
    //printf("Sigtrap receivedn");
    // Checking if it is thanks to seccomp
    if (status >> 8 == (SIGTRAP | (PTRACE_EVENT_SECCOMP << 8))){
      // Perform argument inspection with ptrace
      int syscall = inspect(child_pid);
    }
    // Resume no matter what
    ptrace(PTRACE_CONT, child_pid, 0, 0);
  }
}

/*
 * Prints which system call the stopped tracee `pid` is about to execute.
 *
 * Returns the system call number read from the tracee's registers, or -1 if
 * the registers could not be read (the reason is reported via perror()).
 */
int inspect(pid_t pid){
  printf("From PID: %d\n", pid);
  struct user_regs_struct regs;
  if (ptrace(PTRACE_GETREGS, pid, NULL, &regs) == -1){
    // Without this check `regs` would be read uninitialised.
    perror("ptrace(PTRACE_GETREGS)");
    return -1;
  }
  // Get syscall number (orig_rax is the x86-64 register layout)
  const int syscall_nr = (int)regs.orig_rax;
  printf("------\nCaught syscall: %d\n", syscall_nr);

  if (syscall_nr == __NR_getpid){
    printf("Getpid detected\n");
  }
  return syscall_nr;
}

/*
 * Copies a NUL-terminated string out of the memory of tracee `pid`.
 *
 * pid     - tracee to read from
 * addr    - address of the string in the tracee's address space
 * buff    - destination buffer in the tracer
 * max_len - capacity of `buff` in bytes
 *
 * The tracee is read one word at a time with PTRACE_PEEKDATA. `buff` is
 * always NUL-terminated on return when max_len > 0: the string is cut at the
 * point of a read error, and truncated to max_len - 1 characters if no
 * terminator was found in time. Nothing is done when `buff` is NULL or
 * max_len is 0.
 */
void read_string_into_buff(const pid_t pid, unsigned long long addr, char * buff, unsigned int max_len){
  if (buff == NULL || max_len == 0)
    return;

  /* Are we aligned on the "start" front? */
  unsigned int offset = (unsigned int)(addr % sizeof(long));
  addr -= offset;
  unsigned int i = 0;

  while (i < max_len) {
    /* PEEKDATA returns the word itself, so -1 is only an error if errno is set. */
    errno = 0;
    const long word = ptrace(PTRACE_PEEKDATA, pid, (void *)(unsigned long)addr, NULL);
    if (word == -1 && errno != 0) {
      buff[i] = '\0';
      return;
    }

    /* Byte-wise copy, stopping at the first '\0' char indicating end of string */
    const char *bytes = (const char *)&word; /* Endianness-neutral copy */
    for (; offset < sizeof(long) && i < max_len; ++offset, ++i) {
      buff[i] = bytes[offset];
      if (buff[i] == '\0')
        return;
    }

    offset = 0;
    addr += sizeof(long);
  }

  /* Buffer exhausted before the terminator was seen: truncate. */
  buff[max_len - 1] = '\0';
}

The sample program (compile with gcc sample.c -o sample):

#define _GNU_SOURCE
#include <stdio.h>
#include <sched.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>
#include <signal.h>

/*
 * clone(2) flags for the experiment. Parenthesised so the macro can be used
 * safely inside larger expressions (e.g. FLAGS & CLONE_VM).
 * Per clone(2), CLONE_PARENT_SETTID, CLONE_SETTLS and CLONE_CHILD_CLEARTID
 * each consume an extra clone() argument (parent_tid, tls, child_tid).
 */
#define FLAGS (CLONE_VM|CLONE_SIGHAND|CLONE_THREAD|CLONE_SETTLS|CLONE_PARENT_SETTID|CLONE_CHILD_CLEARTID)

/*
 * Thread body (also called directly by main): prints the multiplication
 * table 1..10 of the number given as a decimal string.
 *
 * arg - NUL-terminated decimal string; NULL or non-numeric text counts as 0.
 * Returns 0.
 */
int fn(void *arg)
{
   printf("\nINFO: This code is running under child process.\n");

   /* atoi(NULL) is undefined behaviour, so guard against a missing argument. */
   const char *text = arg;
   int n = (text != NULL) ? (int)strtol(text, NULL, 10) : 0;

   for (int i = 1; i <= 10; i++) {
      /* Widen before multiplying so a large n cannot overflow a signed int. */
      printf("[%d] %d * %d = %lld\n", getpid(), n, i, (long long)n * i);
   }

   printf("\n");

   return 0;
}

void main(int argc, char *argv[])
{
   printf("[%d] Hello, World!n", getpid());

   void *pchild_stack = malloc(1024 * 1024);
   if ( pchild_stack == NULL ) {
      printf("ERROR: Unable to allocate memory.n");
      exit(EXIT_FAILURE);
   }

   int pid = clone(fn, pchild_stack + (1024 * 1024), FLAGS, argv[1]);
   if ( pid < 0 ) {
        printf("ERROR: Unable to create the child process.n");
        exit(EXIT_FAILURE);
   }

   fn(argv[1]);

   wait(NULL);

   free(pchild_stack);

   printf("INFO: Child process terminated.n");
}

You can test what you want by running ./tracer ./sample. You can also test the original test case ./tracer java and observe that both the tracer and java hang.

ANSWER:
As pointed out in the comments, I had issues in that example that were preventing me from handling signals from the child.

In my original code (not listed here because it is too complex), I was only attaching ptrace AFTER the processes started… and I was only attaching to the PIDs listed by pstree. My mistake was that I omitted the threads (and java is one program that does create threads), which explains why I had issues tracing java only. I modified the code to attach to all the child processes and threads (ps -L -g <Main_PID> -o tid=) and everything works again.

Leave a Comment