Flaky Tests #3546

Open
stephenroller opened this issue Mar 23, 2021 · 8 comments

@stephenroller
Contributor

Please comment or +1 to list your flaky tests. Don't just say "GPU tests"; name the specific failing tests.

@spencerp
Contributor

unittests_gpu17 fails on the "Install opencv, vqa-maskrcnn-benchmark" step.
It often fails with `fatal: unable to access 'https://gitlab.com/vedanuj/vqa-maskrcnn-benchmark.git/': The requested URL returned error: 502` and succeeds when re-run.
Example:
https://app.circleci.com/pipelines/github/facebookresearch/ParlAI/8984/workflows/f00f8441-16ad-42fc-8d87-f2a0b297b29c/jobs/73296
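
Until GitLab stops returning transient 502s, one low-effort mitigation is to retry the clone a few times before giving up. A minimal sketch, assuming the setup step were scripted in Python (the helper name, retry count, and backoff are illustrative, not part of the actual CI config):

```python
import subprocess
import time

def clone_with_retries(repo_url: str, ntries: int = 3, backoff: float = 10.0) -> None:
    """Retry a git clone to ride out transient 5xx errors from the remote."""
    for attempt in range(1, ntries + 1):
        result = subprocess.run(["git", "clone", repo_url])
        if result.returncode == 0:
            return
        if attempt < ntries:
            time.sleep(backoff * attempt)  # simple linear backoff between attempts
    raise RuntimeError(f"git clone failed after {ntries} attempts: {repo_url}")

clone_with_retries("https://gitlab.com/vedanuj/vqa-maskrcnn-benchmark.git")
```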

@stephenroller
Contributor Author

self = <tests.test_transformers.TestTransformerGenerator testMethod=test_beamdelay>

    def test_beamdelay(self):
        """
        Test delayedbeam generation.
        """
        # Delayed Beam is inherently stochastic, just ensure no crash.
        opt = ParlaiParser(True, True).parse_kwargs(
            model_file='zoo:unittest/transformer_generator2/model',
            inference='delayedbeam',
            topk=10,
            beam_delay=2,
            beam_min_length=2,
        )
        agent = create_agent(opt, True)
        agent.observe({'text': '1\n1\n2\n2\n3\n3\n4', 'episode_done': True})
        result = agent.act()
        assert 'text' in result
        assert result['text'] != ''
>       assert '1 2' in result['text']
E       AssertionError: assert '1 2' in '1 3 2 3'
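
Since the test's own comment says delayed beam is inherently stochastic, one option is ParlAI's retry decorator, which `test_bert.py` below already uses with this exact signature. A sketch, with the test body otherwise unchanged:

```python
import unittest
import parlai.utils.testing as testing_utils

class TestTransformerGenerator(unittest.TestCase):
    # Retry a stochastic generation test rather than asserting on a single sample.
    @testing_utils.retry(ntries=3, log_retry=True)
    def test_beamdelay(self):
        ...  # body unchanged from the failing test above
```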

@stephenroller
Contributor Author

self = <test_bert.TestBertModel testMethod=test_crossencoder>

    @testing_utils.retry(ntries=3, log_retry=True)
    def test_crossencoder(self):
        valid, test = testing_utils.train_model(
            dict(
                task='convai2',
                model='bert_ranker/cross_encoder_ranker',
                num_epochs=0.002,
                batchsize=1,
                candidates="inline",
                type_optimization="all_encoder_layers",
                warmup_updates=100,
                text_truncate=32,
                label_truncate=32,
                validation_max_exs=20,
                short_final_eval=True,
            )
        )
        # The cross encoder reaches an interesting state MUCH faster
        # accuracy should be present and somewhere between 0.2 and 0.8
        # (large interval so that it doesn't flake.)
>       self.assertGreaterEqual(test['accuracy'], 0.03)
E       AssertionError: ExactMatchMetric(0) not greater than or equal to 0.03

tests/nightly/gpu/test_bert.py:59: AssertionError
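
This test already retries three times, yet a 0.002-epoch training run can still land at exactly zero accuracy. One hedged option is to pin every RNG before calling `testing_utils.train_model` so that at least the retries are reproducible; a sketch (the helper name is ours, and ParlAI may already seed some of these elsewhere):

```python
import random
import numpy as np
import torch

def seed_everything(seed: int = 42) -> None:
    """Pin all RNGs so a very short training run behaves the same across retries."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
```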

@klshuster
Contributor

self = <test_model_chat.TestModelChat testMethod=test_base_task>

            def test_base_task(self):
    
                with testing_utils.tempdir() as tmpdir:
    
                    # Paths
                    expected_states_folder = os.path.join(
                        os.path.dirname(os.path.abspath(__file__)), 'expected_states'
                    )
                    expected_chat_data_path = os.path.join(
                        expected_states_folder, 'final_chat_data.json'
                    )
                    expected_state_path = os.path.join(expected_states_folder, 'state.json')
                    model_opt_path = os.path.join(tmpdir, 'model_opts.yaml')
                    chat_data_folder = os.path.join(tmpdir, 'final_chat_data')
    
                    # Create a model opt file for the fixed-response model
                    with open(model_opt_path, 'w') as f:
                        model_opt_contents = f"""\
    fixed_response: >
        --model fixed_response
    """
                        f.write(model_opt_contents)
    
                    # Set up the config and database
                    num_blender_convos = 10
                    args = ModelChatBlueprintArgs()
                    overrides = [
                        f'+mephisto.blueprint.{key}={val}'
                        for key, val in args.__dict__.items()
                        if key
                        in [
                            'max_onboard_time',
                            'max_resp_time',
                            'override_opt',
                            'random_seed',
                            'world_file',
                        ]
                    ] + [
                        'mephisto.blueprint.annotations_config_path=${task_dir}/task_config/annotations_config.json',
                        f'mephisto.blueprint.conversations_needed_string=\"fixed_response:{num_blender_convos:d}\"',
                        f'mephisto.blueprint.chat_data_folder={chat_data_folder}',
                        '+mephisto.blueprint.left_pane_text_path=${task_dir}/task_config/left_pane_text.html',
                        '+mephisto.blueprint.max_concurrent_responses=1',
                        f'mephisto.blueprint.model_opt_path={model_opt_path}',
                        f'+mephisto.blueprint.num_conversations={num_blender_convos:d}',
                        '+mephisto.blueprint.onboard_task_data_path=${task_dir}/task_config/onboard_task_data.json',
                        '+mephisto.blueprint.task_description_file=${task_dir}/task_config/task_description.html',
                    ]
                    # TODO: remove all of these params once Hydra 1.1 is released with
                    #  support for recursive defaults
                    self._set_up_config(
                        blueprint_type=BLUEPRINT_TYPE,
                        task_directory=TASK_DIRECTORY,
                        overrides=overrides,
                    )
    
                    # Set up the operator and server
                    shared_state = SharedModelChatTaskState(world_module=world_module)
                    self._set_up_server(shared_state=shared_state)
    
                    # Check that the agent states are as they should be
                    self._get_channel_info().job.task_runner.task_run.get_blueprint().use_onboarding = (
                        False
                    )
                    # Don't require onboarding for this test agent
                    with open(expected_state_path) as f:
                        expected_state = json.load(f)
                    self._test_agent_states(
                        num_agents=1,
                        agent_display_ids=AGENT_DISPLAY_IDS,
                        agent_messages=AGENT_MESSAGES,
                        form_messages=FORM_MESSAGES,
                        form_task_data=FORM_TASK_DATA,
                        expected_states=(expected_state,),
>                       agent_task_data=AGENT_TASK_DATA,
                    )

tests/crowdsourcing/tasks/model_chat/test_model_chat.py:159: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
parlai/crowdsourcing/utils/tests.py:264: in _test_agent_states
    agent_ids = self._register_mock_agents(num_agents=num_agents)
parlai/crowdsourcing/utils/tests.py:170: in _register_mock_agents
    agents = self.db.find_agents()
../Mephisto/mephisto/abstractions/databases/local_database.py:1076: in find_agents
    return [Agent(self, str(r["agent_id"]), row=r) for r in rows]
../Mephisto/mephisto/abstractions/databases/local_database.py:1076: in <listcomp>
    return [Agent(self, str(r["agent_id"]), row=r) for r in rows]
../Mephisto/mephisto/abstractions/providers/mock/mock_agent.py:31: in __init__
    super().__init__(db, db_id, row=row)
../Mephisto/mephisto/data_model/agent.py:73: in __init__
    self.state = AgentState(self)  # type: ignore
../Mephisto/mephisto/abstractions/blueprints/parlai_chat/parlai_chat_agent_state.py:37: in __init__
    self.load_data()
../Mephisto/mephisto/abstractions/blueprints/parlai_chat/parlai_chat_agent_state.py:72: in load_data
    state = json.load(state_json)
/usr/local/lib/python3.7/json/__init__.py:296: in load
    parse_constant=parse_constant, object_pairs_hook=object_pairs_hook, **kw)
/usr/local/lib/python3.7/json/__init__.py:348: in loads
    return _default_decoder.decode(s)
/usr/local/lib/python3.7/json/decoder.py:337: in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

self = <json.decoder.JSONDecoder object at 0x7f119dbded90>, s = '', idx = 0

    def raw_decode(self, s, idx=0):
        """Decode a JSON document from ``s`` (a ``str`` beginning with
        a JSON document) and return a 2-tuple of the Python
        representation and the index in ``s`` where the document ended.
    
        This can be used to decode a JSON document from a string that may
        have extraneous data at the end.
    
        """
        try:
            obj, end = self.scan_once(s, idx)
        except StopIteration as err:
>           raise JSONDecodeError("Expecting value", s, err.value) from None
E           json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)

/usr/local/lib/python3.7/json/decoder.py:355: JSONDecodeError
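
The `Expecting value: line 1 column 1 (char 0)` means Mephisto's `load_data` called `json.load` on an empty state file, which looks like a write/read race in the test harness. A defensive sketch of tolerant loading (the default state shape here is hypothetical, not Mephisto's actual schema):

```python
import json

def load_state(path: str, default=None):
    """Treat an empty or missing state file as 'no saved state' instead of crashing."""
    default = default if default is not None else {"inputs": {}, "outputs": {}}
    try:
        with open(path) as f:
            contents = f.read().strip()
    except FileNotFoundError:
        return default
    if not contents:
        return default  # empty file: writer hasn't flushed yet
    return json.loads(contents)
```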

@klshuster
Contributor

self = <tests.test_distributed.TestDistributedEval testMethod=test_mp_eval>

    def test_mp_eval(self):
        args = dict(
            task='integration_tests:multiturn_nocandidate',
            model='seq2seq',
            model_file='zoo:unittest/seq2seq/model',
            dict_file='zoo:unittest/seq2seq/model.dict',
            skip_generation=False,
            batchsize=8,
        )
        valid, _ = testing_utils.eval_model(args, skip_test=True)
    
        from parlai.scripts.multiprocessing_eval import MultiProcessEval
    
>       valid_mp = MultiProcessEval.main(**args)

tests/test_distributed.py:218: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
parlai/core/script.py:126: in main
    return cls._run_kwargs(kwargs)
parlai/core/script.py:92: in _run_kwargs
    return cls._run_from_parser_and_opt(opt, parser)
parlai/core/script.py:107: in _run_from_parser_and_opt
    return script.run()
parlai/scripts/multiprocessing_eval.py:92: in run
    return launch_and_eval(self.opt, port)
parlai/scripts/multiprocessing_eval.py:66: in launch_and_eval
    retval = multiprocess_eval(0, opt, port)
parlai/scripts/multiprocessing_eval.py:44: in multiprocess_eval
    rank, opt, rank_offset, gpu, init_method=init_method
/opt/circleci/.pyenv/versions/3.7.0/lib/python3.7/contextlib.py:112: in __enter__
    return next(self.gen)
parlai/utils/distributed.py:282: in distributed_context
    rank=rank,
../venv/lib/python3.7/site-packages/torch/distributed/distributed_c10d.py:436: in init_process_group
    store, rank, world_size = next(rendezvous_iterator)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

url = 'tcp://localhost:42476?rank=0&world_size=2'
timeout = datetime.timedelta(seconds=1800), kwargs = {}
_error = <function _tcp_rendezvous_handler.<locals>._error at 0x7f19f85c5e18>
result = ParseResult(scheme='tcp', netloc='localhost:42476', path='', params='', query='rank=0&world_size=2', fragment='')
query = {'rank': '0', 'world_size': '2'}, rank = 0, world_size = 2
start_daemon = True

    def _tcp_rendezvous_handler(url, timeout=default_pg_timeout, **kwargs):
        def _error(msg):
            return _rendezvous_error("tcp:// rendezvous: " + msg)
    
        result = urlparse(url)
        if not result.port:
            raise _error("port number missing")
        query = dict(pair.split("=") for pair in filter(None, result.query.split("&")))
        if "rank" not in query:
            raise _error("rank parameter missing")
        if "world_size" not in query:
            raise _error("world size parameter missing")
    
        rank = int(query["rank"])
        world_size = int(query["world_size"])
        start_daemon = rank == 0
>       store = TCPStore(result.hostname, result.port, world_size, start_daemon, timeout)
E       RuntimeError: Address already in use

../venv/lib/python3.7/site-packages/torch/distributed/rendezvous.py:133: RuntimeError
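
The `Address already in use` comes from two runs racing for the same TCP rendezvous port. A common mitigation is to let the OS pick a free port just before launch; a minimal sketch (note there is still a small window between closing the probe socket and the real bind, so collisions are rarer but not impossible):

```python
import socket

def find_free_port() -> int:
    """Ask the kernel for an unused TCP port instead of guessing one."""
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(("localhost", 0))  # port 0 = let the OS choose
        return s.getsockname()[1]
```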

@klshuster
Contributor

self = <urllib3.response.HTTPResponse object at 0x7fe304f07d30>

    @contextmanager
    def _error_catcher(self):
        """
        Catch low-level python exceptions, instead re-raising urllib3
        variants, so that low-level exceptions are not leaked in the
        high-level api.
    
        On exit, release the connection back to the pool.
        """
        clean_exit = False
    
        try:
            try:
>               yield

../venv/lib/python3.7/site-packages/urllib3/response.py:436: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

self = <urllib3.response.HTTPResponse object at 0x7fe304f07d30>, amt = 32768
decode_content = True, cache_content = False

    def read(self, amt=None, decode_content=None, cache_content=False):
        """
        Similar to :meth:`httplib.HTTPResponse.read`, but with two additional
        parameters: ``decode_content`` and ``cache_content``.
    
        :param amt:
            How much of the content to read. If specified, caching is skipped
            because it doesn't make sense to cache partial content as the full
            response.
    
        :param decode_content:
            If True, will attempt to decode the body based on the
            'content-encoding' header.
    
        :param cache_content:
            If True, will save the returned data such that the same result is
            returned despite of the state of the underlying file object. This
            is useful if you want the ``.data`` property to continue working
            after having ``.read()`` the file object. (Overridden if ``amt`` is
            set.)
        """
        self._init_decoder()
        if decode_content is None:
            decode_content = self.decode_content
    
        if self._fp is None:
            return
    
        flush_decoder = False
        fp_closed = getattr(self._fp, "closed", False)
    
        with self._error_catcher():
            if amt is None:
                # cStringIO doesn't like amt=None
                data = self._fp.read() if not fp_closed else b""
                flush_decoder = True
            else:
                cache_content = False
>               data = self._fp.read(amt) if not fp_closed else b""

../venv/lib/python3.7/site-packages/urllib3/response.py:518: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

self = <http.client.HTTPResponse object at 0x7fe304f07dd8>, amt = 32768

    def read(self, amt=None):
        if self.fp is None:
            return b""
    
        if self._method == "HEAD":
            self._close_conn()
            return b""
    
        if amt is not None:
            # Amount is given, implement using readinto
            b = bytearray(amt)
>           n = self.readinto(b)

/opt/circleci/.pyenv/versions/3.7.0/lib/python3.7/http/client.py:447: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

self = <http.client.HTTPResponse object at 0x7fe304f07dd8>
b = bytearray(b'\x1f\x8b\x08\x00\x9a%\xef[\x00\x03\xec\xbd\xdb\x92\x1cG\x92%X\xcf\xfe\x15\x9e\xf5\x02\x96H2\x05\xbc\x15Y]\...0\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')

    def readinto(self, b):
        """Read up to len(b) bytes into bytearray b and return the number
        of bytes read.
        """
    
        if self.fp is None:
            return 0
    
        if self._method == "HEAD":
            self._close_conn()
            return 0
    
        if self.chunked:
            return self._readinto_chunked(b)
    
        if self.length is not None:
            if len(b) > self.length:
                # clip the read to the "end of response"
                b = memoryview(b)[0:self.length]
    
        # we do not use _safe_read() here because this may be a .will_close
        # connection, and the user is reading more bytes than will be provided
        # (for example, reading in 1k chunks)
>       n = self.fp.readinto(b)

/opt/circleci/.pyenv/versions/3.7.0/lib/python3.7/http/client.py:491: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

self = <socket.SocketIO object at 0x7fe304f07da0>
b = <memory at 0x7fe304f0a108>

    def readinto(self, b):
        """Read up to len(b) bytes into the writable buffer *b* and return
        the number of bytes read.  If the socket is non-blocking and no bytes
        are available, None is returned.
    
        If *b* is non-empty, a 0 return value indicates that the connection
        was shutdown at the other end.
        """
        self._checkClosed()
        self._checkReadable()
        if self._timeout_occurred:
            raise OSError("cannot read from timed out object")
        while True:
            try:
>               return self._sock.recv_into(b)

/opt/circleci/.pyenv/versions/3.7.0/lib/python3.7/socket.py:589: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

self = <ssl.SSLSocket [closed] fd=-1, family=AddressFamily.AF_INET, type=SocketKind.SOCK_STREAM, proto=6>
buffer = <memory at 0x7fe304f0a108>, nbytes = 31183, flags = 0

    def recv_into(self, buffer, nbytes=None, flags=0):
        self._checkClosed()
        if buffer and (nbytes is None):
            nbytes = len(buffer)
        elif nbytes is None:
            nbytes = 1024
        if self._sslobj is not None:
            if flags != 0:
                raise ValueError(
                  "non-zero flags not allowed in calls to recv_into() on %s" %
                  self.__class__)
>           return self.read(nbytes, buffer)

/opt/circleci/.pyenv/versions/3.7.0/lib/python3.7/ssl.py:1049: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

self = <ssl.SSLSocket [closed] fd=-1, family=AddressFamily.AF_INET, type=SocketKind.SOCK_STREAM, proto=6>
len = 31183, buffer = <memory at 0x7fe304f0a108>

    def read(self, len=1024, buffer=None):
        """Read up to LEN bytes and return them.
        Return zero-length string on EOF."""
    
        self._checkClosed()
        if self._sslobj is None:
            raise ValueError("Read on closed or unwrapped SSL socket.")
        try:
            if buffer is not None:
>               return self._sslobj.read(len, buffer)
E               ConnectionResetError: [Errno 104] Connection reset by peer

/opt/circleci/.pyenv/versions/3.7.0/lib/python3.7/ssl.py:908: ConnectionResetError

During handling of the above exception, another exception occurred:

    def generate():
        # Special case for urllib3.
        if hasattr(self.raw, 'stream'):
            try:
>               for chunk in self.raw.stream(chunk_size, decode_content=True):

../venv/lib/python3.7/site-packages/requests/models.py:753: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

self = <urllib3.response.HTTPResponse object at 0x7fe304f07d30>, amt = 32768
decode_content = True

    def stream(self, amt=2 ** 16, decode_content=None):
        """
        A generator wrapper for the read() method. A call will block until
        ``amt`` bytes have been read from the connection or until the
        connection is closed.
    
        :param amt:
            How much of the content to read. The generator will return up to
            much data per iteration, but may return less. This is particularly
            likely when using compressed data. However, the empty string will
            never be returned.
    
        :param decode_content:
            If True, will attempt to decode the body based on the
            'content-encoding' header.
        """
        if self.chunked and self.supports_chunked_reads():
            for line in self.read_chunked(amt, decode_content=decode_content):
                yield line
        else:
            while not is_fp_closed(self._fp):
>               data = self.read(amt=amt, decode_content=decode_content)

../venv/lib/python3.7/site-packages/urllib3/response.py:575: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

self = <urllib3.response.HTTPResponse object at 0x7fe304f07d30>, amt = 32768
decode_content = True, cache_content = False

    def read(self, amt=None, decode_content=None, cache_content=False):
        """
        Similar to :meth:`httplib.HTTPResponse.read`, but with two additional
        parameters: ``decode_content`` and ``cache_content``.
    
        :param amt:
            How much of the content to read. If specified, caching is skipped
            because it doesn't make sense to cache partial content as the full
            response.
    
        :param decode_content:
            If True, will attempt to decode the body based on the
            'content-encoding' header.
    
        :param cache_content:
            If True, will save the returned data such that the same result is
            returned despite of the state of the underlying file object. This
            is useful if you want the ``.data`` property to continue working
            after having ``.read()`` the file object. (Overridden if ``amt`` is
            set.)
        """
        self._init_decoder()
        if decode_content is None:
            decode_content = self.decode_content
    
        if self._fp is None:
            return
    
        flush_decoder = False
        fp_closed = getattr(self._fp, "closed", False)
    
        with self._error_catcher():
            if amt is None:
                # cStringIO doesn't like amt=None
                data = self._fp.read() if not fp_closed else b""
                flush_decoder = True
            else:
                cache_content = False
>               data = self._fp.read(amt) if not fp_closed else b""

../venv/lib/python3.7/site-packages/urllib3/response.py:518: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

self = <contextlib._GeneratorContextManager object at 0x7fe304f106a0>
type = <class 'ConnectionResetError'>
value = ConnectionResetError(104, 'Connection reset by peer')
traceback = <traceback object at 0x7fe304f01f88>

    def __exit__(self, type, value, traceback):
        if type is None:
            try:
                next(self.gen)
            except StopIteration:
                return False
            else:
                raise RuntimeError("generator didn't stop")
        else:
            if value is None:
                # Need to force instantiation so we can reliably
                # tell if we get the same exception back
                value = type()
            try:
>               self.gen.throw(type, value, traceback)

/opt/circleci/.pyenv/versions/3.7.0/lib/python3.7/contextlib.py:130: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

self = <urllib3.response.HTTPResponse object at 0x7fe304f07d30>

    @contextmanager
    def _error_catcher(self):
        """
        Catch low-level python exceptions, instead re-raising urllib3
        variants, so that low-level exceptions are not leaked in the
        high-level api.
    
        On exit, release the connection back to the pool.
        """
        clean_exit = False
    
        try:
            try:
                yield
    
            except SocketTimeout:
                # FIXME: Ideally we'd like to include the url in the ReadTimeoutError but
                # there is yet no clean way to get at it from this context.
                raise ReadTimeoutError(self._pool, None, "Read timed out.")
    
            except BaseSSLError as e:
                # FIXME: Is there a better way to differentiate between SSLErrors?
                if "read operation timed out" not in str(e):  # Defensive:
                    # This shouldn't happen but just in case we're missing an edge
                    # case, let's avoid swallowing SSL errors.
                    raise
    
                raise ReadTimeoutError(self._pool, None, "Read timed out.")
    
            except (HTTPException, SocketError) as e:
                # This includes IncompleteRead.
>               raise ProtocolError("Connection broken: %r" % e, e)
E               urllib3.exceptions.ProtocolError: ("Connection broken: ConnectionResetError(104, 'Connection reset by peer')", ConnectionResetError(104, 'Connection reset by peer'))

../venv/lib/python3.7/site-packages/urllib3/response.py:454: ProtocolError

During handling of the above exception, another exception occurred:

self = <test_transresnet_multimodal.TestTransresnet testMethod=test_transresnet>

    def test_transresnet(self):
        """
        Test pretrained model.
        """
>       _, test = testing_utils.eval_model(MODEL_OPTIONS, skip_valid=True)

tests/nightly/gpu/test_transresnet_multimodal.py:44: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
parlai/utils/testing.py:361: in eval_model
    test = None if skip_test else ems.EvalModel.main(**opt)
parlai/core/script.py:126: in main
    return cls._run_kwargs(kwargs)
parlai/core/script.py:92: in _run_kwargs
    return cls._run_from_parser_and_opt(opt, parser)
parlai/core/script.py:107: in _run_from_parser_and_opt
    return script.run()
parlai/scripts/eval_model.py:222: in run
    return eval_model(self.opt)
parlai/scripts/eval_model.py:191: in eval_model
    agent = create_agent(opt, requireModelExists=True)
parlai/core/agents.py:402: in create_agent
    model = create_agent_from_opt_file(opt)
parlai/core/agents.py:355: in create_agent_from_opt_file
    return model_class(opt_from_file)
projects/image_chat/transresnet_multimodal/transresnet_multimodal.py:70: in __init__
    super().__init__(opt, shared)
projects/personality_captions/transresnet/transresnet.py:101: in __init__
    self.personalities_list = self.load_personalities()
projects/personality_captions/transresnet/transresnet.py:193: in load_personalities
    build(self.opt)
parlai/tasks/personality_captions/build.py:32: in build
    downloadable_file.download_file(dpath)
parlai/core/build_data.py:87: in download_file
    download(self.url, dpath, self.file_name)
parlai/core/build_data.py:181: in download
    for chunk in response.iter_content(CHUNK_SIZE):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

    def generate():
        # Special case for urllib3.
        if hasattr(self.raw, 'stream'):
            try:
                for chunk in self.raw.stream(chunk_size, decode_content=True):
                    yield chunk
            except ProtocolError as e:
>               raise ChunkedEncodingError(e)
E               requests.exceptions.ChunkedEncodingError: ("Connection broken: ConnectionResetError(104, 'Connection reset by peer')", ConnectionResetError(104, 'Connection reset by peer'))

../venv/lib/python3.7/site-packages/requests/models.py:756: ChunkedEncodingError
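
This one is a connection reset partway through downloading model data. If `parlai/core/build_data.py` doesn't already retry (not verified here), a generic sketch of restarting the download on mid-stream resets, using only standard `requests` calls:

```python
import time
import requests

def download_with_retries(url: str, dest: str, ntries: int = 3,
                          chunk_size: int = 32768) -> None:
    """Restart a streamed download from scratch if the connection is reset."""
    for attempt in range(1, ntries + 1):
        try:
            with requests.get(url, stream=True, timeout=60) as response:
                response.raise_for_status()
                with open(dest, "wb") as f:
                    for chunk in response.iter_content(chunk_size):
                        f.write(chunk)
            return
        except (requests.exceptions.ChunkedEncodingError,
                requests.exceptions.ConnectionError):
            if attempt == ntries:
                raise
            time.sleep(2 ** attempt)  # exponential backoff before retrying
```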

@klshuster
Contributor

self = <tests.test_tra.TestTransformerRanker testMethod=test_train_batch_all>

    @testing_utils.retry(ntries=3)
    def test_train_batch_all(self):
        args = self._get_args()
        args['candidates'] = 'batch-all-cands'
        args['eval_candidates'] = 'batch-all-cands'
        valid, test = testing_utils.train_model(args)
        threshold = self._get_threshold()
    
>       self.assertGreaterEqual(valid['hits@1'], threshold)
E       AssertionError: AverageMetric(0.5) not greater than or equal to 0.8

tests/test_tra.py:92: AssertionError

@klshuster
Copy link
Contributor

The following test, along with several others, failed with the error below.

Other affected tests:

  • test_chunked_teacher
  • test_distributed_eval_max_exs
  • test_distributed_eval_stream_mode
  • test_distributed_eval_stream_mode_max_exs
  • test_generator_distributed
  • test_multitask_distributed

self = <tests.test_distributed.TestZero2 testMethod=test_chunked_dynamic_teacher>

    def test_chunked_dynamic_teacher(self):
        valid, test = self._distributed_train_model(
            task='integration_tests',
            num_epochs=0.01,
            datatype='train:stream',
            dynamic_batching='full',
>           truncate=16,
        )

tests/test_distributed.py:146: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
tests/test_distributed.py:33: in _distributed_train_model
    valid, test = mp_train.launch_and_train(popt)
parlai/scripts/multiprocessing_train.py:75: in launch_and_train
    retval = multiprocess_train(0, opt, port)
parlai/scripts/multiprocessing_train.py:45: in multiprocess_train
    return single_train.TrainLoop(opt).train()
parlai/scripts/train_model.py:347: in __init__
    self.agent = create_agent(opt)
parlai/core/agents.py:479: in create_agent
    model = model_class(opt)
parlai/core/torch_generator_agent.py:484: in __init__
    self.model = fsdp_utils.fsdp_wrap(self.build_model())
parlai/utils/fsdp.py:111: in fsdp_wrap
    return wrap(module)
../venv/lib/python3.7/site-packages/fairscale/nn/wrap/auto_wrap.py:170: in wrap
    return ConfigAutoWrap.wrapper_cls(module, **wrap_overrides)
../venv/lib/python3.7/site-packages/fairscale/nn/data_parallel/fully_sharded_data_parallel.py:300: in __init__
    self._fsdp_wrapped_module: nn.Module = FlattenParamsWrapper(module, param_list=params)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

self = FlattenParamsWrapper(
  (_fpw_module): TransformerGeneratorModel_Swappable(
    (embeddings): Embedding(11, 8, padding...   (norm3): LayerNorm((8,), eps=1e-05, elementwise_affine=True)
            )
          )
        )
      )
    )
  )
)
module = TransformerGeneratorModel_Swappable(
  (embeddings): Embedding(11, 8, padding_idx=0)
  (encoder): TransformerEncoder_S...   )
            (norm3): LayerNorm((8,), eps=1e-05, elementwise_affine=True)
          )
        )
      )
    )
  )
)
param_list = [[Parameter containing:
tensor([[ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0....  [ 6.5029e-01, -7.5969e-01,  9.9749e-01,  7.0737e-02,  1.4944e-01,
          9.8877e-01,  1.4999e-02,  9.9989e-01]])]]

    def __init__(self, module: nn.Module, param_list: ParamGroups = None):
        super().__init__()
        self._fpw_module = module
        self.is_flattened = False
    
        # Handle param_list being None.
        if param_list is None:
            param_list = list(module.parameters())
    
        # Be backward compatible and turn a single param list into a list of
        # a single list.
        if len(param_list) > 0 and isinstance(param_list[0], nn.Parameter):
            param_list = [cast(List[nn.Parameter], param_list)]
    
        # Since the parameters will be deleted, let's record the number original
        # parameters managed by this class. This and get_param_views function
        # below are used by fsdp_optim_utils.py to save/restore optimizer state,
        # which mirrors the flatten parameters here.
        self.num_params_managed = 0
    
        self._param_sets = []
        overall_param_set: Set[nn.Parameter] = set()
        for p_list in param_list:
            # Remove any duplicates from the list.
            p_set: Set[nn.Parameter] = set(cast(List[nn.Parameter], p_list))
    
            self.num_params_managed += len(p_set)
            overall_param_set = overall_param_set.union(p_set)
    
            # Convert from list of Parameters to set of (Module, name) tuples,
            # which will survive in case the parameter instances are reset.
            # Also, a shared param will correctly appear under multiple modules
            # as they should.
            new_p_set_with_names = set()
            for m in self.modules():
                for n, p in m.named_parameters(recurse=False):
                    if p in p_set:
                        new_p_set_with_names.add((m, n))
            if new_p_set_with_names:
                self._param_sets.append(new_p_set_with_names)
    
        if len(overall_param_set) != self.num_params_managed:
            # Each p_list above could have shared params. However, you can't
            # have shared params cross different p_list. That means part of
            # the flattened parameter must be shared, which is impossible to
            # support.
            raise ValueError(f"Incorrect param groups {len(overall_param_set)} vs {self.num_param_managed}")
    
        self.flat_params: List[FlatParameter] = []
        self._param_infos: List[Tuple[str, nn.Module, str]] = []
        self._shared_param_infos: List[Tuple[nn.Module, str, nn.Module, str]] = []
    
        # Init all flat_params.
        for new_p_set in self._param_sets:
            params = self._init_flatten_params(new_p_set)
            assert (
                len(set(p.requires_grad for p in params)) == 1
>           ), "expects all parameters in the same parameter group of the module to have same requires_grad"
E           AssertionError: expects all parameters in the same parameter group of the module to have same requires_grad

../venv/lib/python3.7/site-packages/fairscale/nn/misc/flatten_params_wrapper.py:187: AssertionError
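
The assertion fires because fairscale's `FlattenParamsWrapper` cannot flatten a module whose parameters disagree on `requires_grad` (e.g. some embeddings frozen while the rest train). A small diagnostic sketch one could run before the `fsdp_wrap` call to surface the offending parameters (helper name is ours):

```python
import torch.nn as nn

def check_uniform_requires_grad(module: nn.Module) -> None:
    """Flattening requires every parameter in a wrapped module to agree on
    requires_grad; report any mismatch before wrapping."""
    flags = {p.requires_grad for p in module.parameters()}
    if len(flags) > 1:
        frozen = [n for n, p in module.named_parameters() if not p.requires_grad]
        raise ValueError(f"Mixed requires_grad; frozen params: {frozen}")
```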
